lilac: Lilac#

class lilac.Config#

Configures a set of datasets for a lilac instance.

param clusters: list[ClusterConfig] = []#

The set of datasets and paths to compute clusters for.

param concept_model_cache_embeddings: list[str] = []#

The set of embeddings for which to compute model caches, for every concept.

param datasets: list[DatasetConfig] = []#

The configurations for the datasets in the project.

param signals: list[Signal] = []#

The signals to run for every dataset.

param use_garden: bool = False#

Accelerate computation by running remotely on Lilac Garden. Signals, embeddings, and clusters will be run remotely if they support Lilac Garden.

model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

class lilac.DatasetConfig#

Configures a dataset with a source and transformations.

param embeddings: list[EmbeddingConfig] = []#

The embedding configs for the dataset.

param name: str [Required]#

The name of the dataset.

param namespace: str [Required]#

The namespace of the dataset.

param settings: DatasetSettings | None = None#

Dataset settings.

param signals: list[SignalConfig] = []#

The signal configs for the dataset.

param source: Source [Required]#

The source configuration. This config determines where data is loaded from for the dataset.

param tags: list[str] | None = []#

[Deprecated] This field is deprecated in favor of DatasetSettings.tags and will be removed in a later release.

model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

class lilac.DatasetSettings#

The persistent settings for a dataset.

param preferred_embedding: str | None = 'gte-small'#
param tags: list[str] | None = []#

A list of tags for the dataset to organize in the UI.

param ui: DatasetUISettings = DatasetUISettings(media_paths=[], markdown_paths=[], view_type='single_item', label_to_keycode={})#
model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

class lilac.DatasetUISettings#

The UI persistent settings for a dataset.

param label_to_keycode: dict[str, str] = {}#
param markdown_paths: list[tuple[str, ...]] = []#
param media_paths: list[tuple[str, ...]] = []#
param view_type: Literal['scroll', 'single_item'] = 'single_item'#
serialize_markdown_paths(markdown_paths: list[tuple[str, ...]]) list[str | list[str]]#

Serialize markdown paths.

serialize_media_paths(media_paths: list[tuple[str, ...]]) list[str | list[str]]#

Serialize media paths.

model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

class lilac.EmbeddingConfig#

Configures an embedding on a source path.

param embedding: str [Required]#
param path: tuple[str, ...] [Required]#
serialize_path(path: tuple[str, ...]) str | list[str]#

Serialize a path.

model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

class lilac.Field#

Holds information for a field in the schema.

param bins: list[tuple[str, float | int | None, float | int | None]] | None = None#
param categorical: bool | None = None#
param cluster: ClusterInfo | None = None#
param dtype: MapType | DataType | None = None#
param embedding: EmbeddingInfo | None = None#
param fields: dict[str, Field] | None = None#
param label: str | None = None#
param map: MapInfo | None = None#
param repeated_field: Field | None = None#
param signal: dict[str, Any] | None = None#
model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

class lilac.LilacEnvironment#

Lilac environment variables.

These can be set with operating system environment variables to override behavior.

For python, see: https://docs.python.org/3/library/os.html#os.environ

For bash, see: https://www.gnu.org/software/bash/manual/bash.html#Environment

param COHERE_API_KEY: str [Required]#

The Cohere API key, used for computing cohere embeddings.

param DEBUG: str [Required]#

Turn on Lilac debug mode to log queries and timing information.

param DISABLE_LOGS: str [Required]#

Disable log() statements to the console.

param GCS_ACCESS_KEY: str [Required]#

The GCS access key for GCS operations.

param GCS_REGION: str [Required]#

The GCS region for GCS operations.

param GCS_SECRET_KEY: str [Required]#

The GCS secret key for GCS operations.

param GOOGLE_ANALYTICS_ENABLED: str [Required]#

Set to true to enable Google analytics.

param GOOGLE_CLIENT_ID: str [Required]#

The Google OAuth client ID. Required when LILAC_AUTH_ENABLED=true. Details can be found at https://developers.google.com/identity/protocols/oauth2.

param GOOGLE_CLIENT_SECRET: str [Required]#

The Google OAuth client secret. Details can be found at https://developers.google.com/identity/protocols/oauth2.

param HF_ACCESS_TOKEN: str [Required]#

The HuggingFace access token, used for downloading data to a space from a private dataset. This is also required if the HuggingFace space is private.

param LILAC_AUTH_ADMIN_EMAILS: str [Required]#

A comma-separated list of Google emails that are allowed full edit-access, as if the LILAC_AUTH_ENABLED environment flag was disabled. These email addresses are used in concert with the GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET environment flags to authenticate users.

param LILAC_AUTH_ENABLED: str [Required]#

Set to true to enable read-only mode, disabling the ability to add datasets & compute dataset signals. When enabled, GOOGLE_CLIENT_ID, GOOGLE_CLIENT_SECRET and LILAC_OAUTH_SECRET_KEY should also be set.

param LILAC_AUTH_USER_DISABLE_LABEL_ALL: str [Required]#

Set to true to prevent non-admin users from using the label-all feature in the UI.

param LILAC_AUTH_USER_EDIT_LABELS: str [Required]#

Set to true to allow non-admin users to edit labels.

param LILAC_DATA_PATH: str [Required]#

[Deprecated] The Lilac data path where datasets, concepts, caches are stored. This is deprecated in favor of LILAC_PROJECT_DIR, but will work for backwards compat.

param LILAC_DISABLE_ERROR_NOTIFICATIONS: str [Required]#

Set lilac in production mode. This will disable error messages in the UI.

param LILAC_LOAD_ON_START_SERVER: str [Required]#

When true, will load from lilac.yml upon startup.

param LILAC_OAUTH_SECRET_KEY: str [Required]#

The Google OAuth random secret key. Details can be found at https://developers.google.com/identity/protocols/oauth2.

param LILAC_PROJECT_DIR: str [Required]#

The Lilac project directory where datasets, concepts, and caches are stored. This replaces LILAC_DATA_PATH, which is deprecated but has the same functionality. This can be set with set_project_dir.

param LILAC_USE_TABLE_INDEX: str [Required]#

Use persistent tables with rowid indexes.

param OPENAI_API_KEY: str [Required]#

The OpenAI API key, used for computing openai embeddings and generating positive examples for concept seeding.

param S3_ACCESS_KEY: str [Required]#

The S3 access key for S3 operations.

param S3_ENDPOINT: str [Required]#

The S3 endpoint URL for S3-like operations, including GCS and Azure.

param S3_REGION: str [Required]#

The S3 region for S3 operations.

param S3_SECRET_KEY: str [Required]#

The S3 secret key for S3 operations.

param USE_TABLE_INDEX: str [Required]#

Use persistent tables with rowid indexes. NOTE: This is deprecated in favor of LILAC_USE_TABLE_INDEX.

model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

class lilac.SignalConfig#

Configures a signal on a source path.

param path: tuple[str, ...] [Required]#
param signal: Signal [Required]#
serialize_path(path: tuple[str, ...]) str | list[str]#

Serialize a path.

model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

class lilac.Source#

Interface for sources to implement. A source processes a set of shards and writes files.

load_to_parquet(output_dir: str, task_id: str | None) SourceManifest#

Process the source by directly writing a parquet file.

You should only override one of yield_items or load_to_parquet.

This fast path exists for sources where we are able to avoid the overhead of creating python dicts for every row by using non-Python parquet writers like DuckDB.

The output parquet files should have a {schema.ROWID} column defined. This ROWID should not be part of source_schema, however.

Finally, self.source_schema().num_items is usually computed in setup(), but can be left as None for sources implementing load_to_parquet, since this count is only used to display a progress bar. load_to_parquet doesn’t have progress bar support so the count is unnecessary. However, you must still keep track of how many items were processed in total, because fields like self.sample_size should reflect the actual size of the dataset if len(dataset) < sample_size.

Parameters:
  • output_dir – The directory to write the parquet files to.

  • task_id – The TaskManager id for this task. This is used to update the progress of the task.

Returns:

A SourceManifest that describes schema and parquet file locations.

serialize_model(serializer: Callable[[...], dict[str, Any]]) dict[str, Any]#

Serialize the model to a dictionary.

setup() None#

Prepare the source for processing.

This allows the source to do setup outside the constructor, but before it is processed. This avoids potentially expensive computation when the pydantic model is deserialized.

source_schema() SourceSchema#

Return the source schema for this source.

Returns:

A SourceSchema with

fields: a mapping of top-level columns to fields that describe the schema of the source. num_items: the number of items in the source, used for progress.

teardown() None#

Tears down the source after processing.

yield_items() Iterable[Any]#

Process the source by yielding individual rows of the source data.

You should only override one of yield_items or load_to_parquet.

This method is easier to use, and simply requires you to return an iterator of Python dicts. Lilac will take your iterable of items and handle writing it to parquet. You will still have to override source_schema.

model_computed_fields: ClassVar[dict[str, ComputedFieldInfo]] = {}#

A dictionary of computed field names and their corresponding ComputedFieldInfo objects.

name: ClassVar[str]#
router: ClassVar[APIRouter | None] = None#
class lilac.SpanVector#

A span with a vector.

lilac.chunk_embedding(start: int, end: int, embedding: ndarray | None) Any#

Creates a lilac chunk embedding: a vector with a pointer to a chunk of text.

Parameters:
  • start – The start character of the chunk with respect to the original text.

  • end – The end character of the chunk with respect to the original text.

  • embedding – The embedding vector for the chunk.

lilac.create_dataset(config: DatasetConfig, project_dir: str | Path | None = None, overwrite: bool = False) Dataset#

Load a dataset from a given source configuration.

Parameters:
  • config – The dataset configuration to load.

  • project_dir – The path to the project directory for where to create the dataset. If not defined, uses the project directory from LILAC_PROJECT_DIR or [deprecated] LILAC_DATA_PATH.

  • overwrite – Whether to overwrite the dataset if it already exists.

lilac.deploy_config(hf_space: str, config: Config, create_space: bool | None = False, hf_space_storage: Literal['small'] | Literal['medium'] | Literal['large'] | None = None, hf_token: str | None = None) str#

Deploys a Lilac config object to a HuggingFace Space.

Data will be loaded on the HuggingFace space.

Parameters:
  • hf_space – The HuggingFace space to deploy to. Should be in the format “org_name/space_name”.

  • config – The lilac config object to deploy.

  • create_space – When True, creates the HuggingFace space if it doesn't exist. The space will be created with the storage type defined by --hf_space_storage.

  • hf_space_storage – If defined, sets the HuggingFace space persistent storage type. NOTE: This only actually sets the space storage type when creating the space. For more details, see https://huggingface.co/docs/hub/spaces-storage

  • hf_token – The HuggingFace access token to use when making datasets private. This can also be set via the HF_ACCESS_TOKEN environment flag.

lilac.deploy_project(hf_space: str, project_config: Config | None = None, project_dir: str | None = None, concepts: list[str] | None = None, skip_concept_upload: bool | None = False, deploy_at_head: bool = False, skip_ts_build: bool = False, create_space: bool | None = False, load_on_space: bool | None = False, hf_space_storage: Literal['small'] | Literal['medium'] | Literal['large'] | None = None, hf_token: str | None = None) str#

Deploy a project to huggingface.

Parameters:
  • hf_space – The huggingface space. Should be formatted like SPACE_ORG/SPACE_NAME.

  • project_config – A project config for the space; defaults to config file found in project_dir.

  • project_dir – The project directory to grab data from. Defaults to env.LILAC_PROJECT_DIR.

  • concepts – The names of concepts to upload. Defaults to all concepts.

  • skip_concept_upload – When true, skips uploading concepts.

  • deploy_at_head – If true, deploys the latest code from your machine. Otherwise, deploys from PyPI’s latest published package.

  • skip_ts_build – (Only relevant when deploy_at_head=True) - Skips building frontend assets.

  • create_space – When True, creates the HuggingFace space if it doesn't exist. The space will be created with the storage type defined by --hf_space_storage.

  • load_on_space – When True, loads the datasets from your project in the space and does not upload data. NOTE: This could be expensive if your project config locally has embeddings as they will be recomputed in HuggingFace.

  • hf_space_storage – If defined, sets the HuggingFace space persistent storage type. NOTE: This only actually sets the space storage type when creating the space. For more details, see https://huggingface.co/docs/hub/spaces-storage

  • hf_token – The HuggingFace access token to upload so that the space can access private datasets.

lilac.download(url_or_repo: str, project_dir: str | None = None, dataset_namespace: str | None = 'local', dataset_name: str | None = None, hf_token: str | None = None, overwrite: bool | None = False) None#

Download a Lilac dataset from HuggingFace.

Parameters:
  • url_or_repo – A remote URL to a Lilac-processed dataset. Currently only supports HuggingFace dataset URLs. Can be a full URL: https://huggingface.co/datasets/lilacai/lilac-OpenOrca or a repo_id: lilacai/lilac-OpenOrca.

  • project_dir – The project directory to use for the demo. Defaults to env.LILAC_PROJECT_DIR which can be set with ll.set_project_dir().

  • dataset_namespace – The local namespace to use. Defaults to ‘local’.

  • dataset_name – The local dataset name to use. Defaults to the name of the HuggingFace dataset.

  • hf_token – The HuggingFace access token to use when downloading private datasets. This can also be set via the HF_ACCESS_TOKEN environment flag.

  • overwrite – Whether to overwrite the dataset if it already exists.

lilac.from_dicts(namespace: str, name: str, items: Iterable[Any], overwrite: bool = False) Dataset#

Load a dataset from an iterable of python dictionaries.

lilac.from_huggingface(dataset: str | Dataset | DatasetDict, namespace: str = 'local', name: str | None = None, overwrite: bool = False) Dataset#

Load a dataset from HuggingFace.

Parameters:
  • dataset – A HuggingFace dataset or its name registered on the hub.

  • namespace – The Lilac namespace for the loaded dataset. Defaults to local.

  • name – The Lilac name of the dataset to create. Defaults to the name of the HuggingFace dataset.

  • overwrite – Whether to overwrite the dataset if it already exists.

lilac.get_dataset(namespace: str, dataset_name: str, project_dir: Path | str | None = None) Dataset#

Get the dataset instance.

lilac.get_project_dir() str#

Return the base path for data.

lilac.has_dataset(namespace: str, dataset_name: str, project_dir: Path | str | None = None) bool#

Return whether the dataset exists.

lilac.init(project_dir: Path | str | None = None) None#

Initializes a project.

lilac.list_datasets(project_dir: Path | str | None = None) list[DatasetInfo]#

List the datasets in a project directory.

lilac.load(project_dir: str | Path | None = None, config: str | Path | Config | None = None, overwrite: bool = False) None#

Load a project from a project configuration.

Parameters:
  • project_dir – The path to the project directory for where to create the dataset. If not defined, uses the project directory from LILAC_PROJECT_DIR or [deprecated] LILAC_DATA_PATH. The project_dir can be set globally with set_project_dir.

  • config – A Lilac config or the path to a json or yml file describing the configuration. The contents should be an instance of lilac.Config or lilac.DatasetConfig. When not defined, uses LILAC_PROJECT_DIR/lilac.yml.

  • overwrite – When True, runs all data from scratch, overwriting existing data. When false, only load new datasets, embeddings, and signals.

  • execution_type – The execution type for the task manager. Can be ‘processes’ or ‘threads’.

lilac.register_embedding(embedding_cls: Type[TextEmbeddingSignal], exists_ok: bool = False) None#

Register an embedding in the global registry.

lilac.register_signal(signal_cls: Type[Signal], exists_ok: bool = False) None#

Register a signal in the global registry.

Parameters:
  • signal_cls – The signal class to register.

  • exists_ok – Whether to allow overwriting an existing signal.

lilac.set_project_dir(project_dir: str | Path) None#

Set the project directory.

lilac.span(start: int, end: int, metadata: dict[str, Any] = {}) Any#

Creates a lilac span item, representing a pointer to a slice of text.

lilac.start_server(host: str = '127.0.0.1', port: int = 5432, open: bool = False, project_dir: str = '', load: bool = False) Server#

Starts the Lilac web server.

Parameters:
  • host – The host to run the server on.

  • port – The port to run the server on.

  • open – Whether to open a browser tab upon startup.

  • project_dir – The path to the Lilac project directory. If not specified, the LILAC_PROJECT_DIR environment variable will be used (this can be set from set_project_dir). If LILAC_PROJECT_DIR is not defined, will start in the current directory.

  • load – Whether to load from the lilac.yml when the server boots up. This will diff the config with the fields that are computed and compute them when the server boots up.

lilac.stop_server() None#

Stops the Lilac web server.

lilac.upload(dataset: str, project_dir: str | None = None, url_or_repo: str | None = None, public: bool | None = False, readme_suffix: str | None = None, hf_token: str | None = None) None#

Uploads local datasets to HuggingFace datasets.

Parameters:
  • project_dir – The project directory to use for the demo. Defaults to env.LILAC_PROJECT_DIR which can be set with ll.set_project_dir().

  • dataset – The dataset to upload. Can be a local dataset name, or a namespace/dataset name.

  • url_or_repo – The HuggingFace dataset repo ID to use. If not specified, will be automatically generated.

  • public – Whether to make the dataset public. Defaults to False.

  • readme_suffix – A suffix to add to the README.md file. Defaults to None.

  • hf_token – The HuggingFace access token to use when making datasets private. This can also be set via the HF_ACCESS_TOKEN environment flag.