Skip to main content
Chroma provides comprehensive type definitions for type-safe interactions with the API.

Core Types

IDs

Record identifiers.
from chromadb.api.types import ID, IDs

# Single ID
id: ID = "doc-123"

# Multiple IDs
ids: IDs = ["doc-1", "doc-2", "doc-3"]
ID
str
A single record identifier.
IDs
List[str]
A list of record identifiers.

Documents

Text documents.
from chromadb.api.types import Document, Documents

# Single document
doc: Document = "This is a document"

# Multiple documents
docs: Documents = ["doc 1", "doc 2", "doc 3"]
Document
str
A single text document.
Documents
List[str]
A list of text documents.

Embeddings

Vector embeddings.
from chromadb.api.types import Embedding, Embeddings
import numpy as np

# Single embedding (as numpy array)
embedding: Embedding = np.array([0.1, 0.2, 0.3], dtype=np.float32)

# Multiple embeddings
embeddings: Embeddings = [
    np.array([0.1, 0.2, 0.3], dtype=np.float32),
    np.array([0.4, 0.5, 0.6], dtype=np.float32)
]
Embedding
ndarray
A single embedding vector as numpy array.
Embeddings
List[ndarray]
A list of embedding vectors.
PyEmbedding
List[float]
A single embedding as Python list (alternative format).
PyEmbeddings
List[List[float]]
Multiple embeddings as nested Python lists.

Metadata

Record metadata.
from chromadb.api.types import Metadata, Metadatas

# Single metadata
metadata: Metadata = {
    "title": "Document Title",
    "author": "John Doe",
    "year": 2024,
    "score": 0.95,
    "published": True,
    "tags": ["AI", "ML", "NLP"]
}

# Multiple metadatas
metadatas: Metadatas = [
    {"type": "article", "year": 2023},
    {"type": "blog", "year": 2024}
]
Metadata
Dict[str, Union[str, int, float, bool, List, SparseVector]]
Metadata dictionary. Values can be:
  • Strings
  • Numbers (int or float)
  • Booleans
  • Lists of strings, numbers, or booleans (homogeneous)
  • SparseVector objects
Metadatas
List[Metadata]
A list of metadata dictionaries.
UpdateMetadata
Dict[str, Union[str, int, float, bool, List, SparseVector, None]]
Metadata for updates. Same as Metadata but allows None values to unset fields.

SparseVector

Sparse vector representation.
from chromadb.api.types import SparseVector

# Create sparse vector
sparse_vec = SparseVector(
    indices=[0, 5, 10],
    values=[0.8, 0.6, 0.9]
)

# Use in metadata
metadata = {
    "sparse_embedding": sparse_vec
}
indices
List[int]
Non-zero indices (must be sorted in ascending order).
values
List[float]
Corresponding values (must be same length as indices).
Indices must be non-negative, sorted, and have the same length as values.

URIs

Uniform Resource Identifiers for external data.
from chromadb.api.types import URI, URIs

# Single URI
uri: URI = "https://example.com/image.jpg"

# Multiple URIs
uris: URIs = [
    "https://example.com/image1.jpg",
    "https://example.com/image2.jpg"
]
URI
str
A single URI string.
URIs
List[str]
A list of URI strings.

Query Filters

Where

Metadata filter for querying.
from chromadb.api.types import Where

# Simple equality
where: Where = {"author": "John Doe"}

# Comparison operators
where: Where = {"year": {"$gte": 2020}}

# Logical operators
where: Where = {
    "$and": [
        {"category": "science"},
        {"year": {"$gte": 2020}}
    ]
}

# Array membership
where: Where = {"tags": {"$contains": "AI"}}
Where
Dict
Metadata filter using MongoDB-style query operators.
Comparison Operators:
  • {"field": value} - Equality (shorthand)
  • {"field": {"$eq": value}} - Equality
  • {"field": {"$ne": value}} - Not equal
  • {"field": {"$gt": value}} - Greater than
  • {"field": {"$gte": value}} - Greater than or equal
  • {"field": {"$lt": value}} - Less than
  • {"field": {"$lte": value}} - Less than or equal
Set Operators:
  • {"field": {"$in": [values]}} - In list
  • {"field": {"$nin": [values]}} - Not in list
Array Operators:
  • {"field": {"$contains": value}} - Array contains value
  • {"field": {"$not_contains": value}} - Array does not contain value
Logical Operators:
  • {"$and": [conditions]} - All conditions must match
  • {"$or": [conditions]} - Any condition must match

WhereDocument

Document content filter for querying.
from chromadb.api.types import WhereDocument

# Substring match
where_doc: WhereDocument = {"$contains": "machine learning"}

# Regex match
where_doc: WhereDocument = {"$regex": "^Chapter [0-9]+"}

# Logical operators
where_doc: WhereDocument = {
    "$and": [
        {"$contains": "AI"},
        {"$not_contains": "deprecated"}
    ]
}
WhereDocument
Dict
Document content filter using operators.
String Operators:
  • {"$contains": "text"} - Document contains substring
  • {"$not_contains": "text"} - Document does not contain substring
  • {"$regex": "pattern"} - Document matches regex
  • {"$not_regex": "pattern"} - Document does not match regex
Logical Operators:
  • {"$and": [conditions]} - All conditions must match
  • {"$or": [conditions]} - Any condition must match

Result Types

GetResult

Result from collection.get() operations.
from chromadb.api.types import GetResult

results: GetResult = collection.get(
    ids=["id1", "id2"],
    include=["documents", "metadatas"]
)

# Access results
for id, doc, meta in zip(results["ids"], results["documents"], results["metadatas"]):
    print(f"ID: {id}")
    print(f"Document: {doc}")
    print(f"Metadata: {meta}")
ids
List[str]
List of record IDs (always included).
documents
List[str]
List of documents (if included).
metadatas
List[Dict]
List of metadata dictionaries (if included).
embeddings
List[ndarray]
List of embeddings (if included).
uris
List[str]
List of URIs (if included).
included
List[str]
List of fields that were included in the query.

QueryResult

Result from collection.query() operations.
from chromadb.api.types import QueryResult

results: QueryResult = collection.query(
    query_texts=["query1", "query2"],
    n_results=5
)

# Access batched results
for batch_ids, batch_docs, batch_distances in zip(
    results["ids"],
    results["documents"],
    results["distances"]
):
    print(f"Batch results: {len(batch_ids)} records")
    for id, doc, dist in zip(batch_ids, batch_docs, batch_distances):
        print(f"  {id}: {doc} (distance: {dist})")
ids
List[List[str]]
Nested list of IDs (one list per query).
documents
List[List[str]]
Nested list of documents (if included).
metadatas
List[List[Dict]]
Nested list of metadata dictionaries (if included).
embeddings
List[List[ndarray]]
Nested list of embeddings (if included).
distances
List[List[float]]
Nested list of distances (if included).
included
List[str]
List of fields that were included in the query.

SearchResult

Result from collection.search() operations (experimental).
from chromadb.api.types import SearchResult
from chromadb.execution.expression import Search, K, Knn

results: SearchResult = collection.search(
    Search().rank(Knn(query=[0.1, 0.2])).limit(10)
)

# Convert to row format
for payload_rows in results.rows():
    for row in payload_rows:
        print(f"ID: {row['id']}")
        if 'document' in row:
            print(f"Document: {row['document']}")
        if 'score' in row:
            print(f"Score: {row['score']}")
ids
List[List[str]]
Nested list of IDs (one list per search).
documents
List[List[str]]
Nested list of documents (if selected).
embeddings
List[List[List[float]]]
Nested list of embeddings (if selected).
metadatas
List[List[Dict]]
Nested list of metadata dictionaries (if selected).
scores
List[List[float]]
Nested list of ranking scores (if scoring is used).
select
List[List[str]]
List of selected keys for each payload.
Methods:
  • rows(): Convert column-major format to row-major format, returning List[List[SearchResultRow]]

IndexingStatus

Indexing progress information.
from chromadb.api.types import IndexingStatus

status: IndexingStatus = collection.get_indexing_status()

print(f"Indexed: {status.num_indexed_ops}")
print(f"Unindexed: {status.num_unindexed_ops}")
print(f"Total: {status.total_ops}")
print(f"Progress: {status.op_indexing_progress:.2%}")
num_indexed_ops
int
Number of user operations that have been indexed.
num_unindexed_ops
int
Number of user operations pending indexing.
total_ops
int
Total number of user operations in the collection.
op_indexing_progress
float
Proportion of operations indexed (value between 0.0 and 1.0).

Schema Configuration Types

Schema

Collection schema for indexing and encryption configuration.
from chromadb import Schema, VectorIndexConfig, HnswIndexConfig, FtsIndexConfig

schema = Schema()

# Configure vector index
schema.create_index(
    config=VectorIndexConfig(
        space="cosine",
        hnsw=HnswIndexConfig(ef_construction=200)
    )
)

# Enable full-text search on documents
schema.create_index(
    config=FtsIndexConfig(),
    key="#document"
)

collection = client.create_collection(
    name="my_collection",
    schema=schema
)

VectorIndexConfig

Configuration for dense vector indexes.
from chromadb import VectorIndexConfig, HnswIndexConfig

config = VectorIndexConfig(
    space="cosine",
    embedding_function=my_embedding_function,
    hnsw=HnswIndexConfig(
        ef_construction=200,
        max_neighbors=16
    )
)
space
'cosine' | 'l2' | 'ip'
Distance metric: "cosine", "l2" (Euclidean), or "ip" (inner product).
embedding_function
EmbeddingFunction
Embedding function for the index.
source_key
str
Source key to extract vectors from. Defaults to "#document" for the default embedding.
hnsw
HnswIndexConfig
HNSW algorithm configuration.
spann
SpannIndexConfig
SPANN algorithm configuration.

HnswIndexConfig

Hierarchical Navigable Small World (HNSW) algorithm configuration.
from chromadb import HnswIndexConfig

hnsw = HnswIndexConfig(
    ef_construction=200,
    max_neighbors=16,
    ef_search=100
)
ef_construction
int
Size of candidate list during index construction. Higher values improve quality but increase build time.
max_neighbors
int
Maximum number of neighbors per node (M parameter). Higher values improve recall but increase memory usage.
ef_search
int
Size of candidate list during search. Higher values improve recall but increase query time.
num_threads
int
Number of threads for index construction.
batch_size
int
Batch size for index operations.
sync_threshold
int
Threshold for syncing index to disk.
resize_factor
float
Factor for resizing the index.

FtsIndexConfig

Full-text search index configuration.
from chromadb import FtsIndexConfig

fts = FtsIndexConfig()
FtsIndexConfig has no configurable parameters. Use it to enable full-text search on document fields.

SparseVectorIndexConfig

Sparse vector index configuration.
from chromadb import SparseVectorIndexConfig

sparse_config = SparseVectorIndexConfig(
    embedding_function=my_sparse_embedding_function,
    source_key="#document",
    bm25=True
)
embedding_function
SparseEmbeddingFunction
Sparse embedding function for the index.
source_key
str
Source key to extract sparse vectors from.
bm25
bool
Enable BM25 weighting for sparse vectors.

Inverted Index Configs

Configuration for inverted indexes on metadata fields.
from chromadb import (
    StringInvertedIndexConfig,
    IntInvertedIndexConfig,
    FloatInvertedIndexConfig,
    BoolInvertedIndexConfig
)

# All have no configurable parameters
string_idx = StringInvertedIndexConfig()
int_idx = IntInvertedIndexConfig()
float_idx = FloatInvertedIndexConfig()
bool_idx = BoolInvertedIndexConfig()
These indexes are automatically enabled on metadata fields by default. No configuration required.

Embedding Functions

EmbeddingFunction

Protocol for implementing custom embedding functions.
from chromadb.api.types import EmbeddingFunction, Documents, Embeddings
import numpy as np

class MyEmbeddingFunction(EmbeddingFunction[Documents]):
    """Example custom embedding function that maps each document to a fixed vector."""

    def __call__(self, input: Documents) -> Embeddings:
        """Return one embedding per document in ``input``, in the same order."""
        # Replace the placeholder vector below with your embedding logic.
        return [
            np.array([0.1, 0.2, 0.3], dtype=np.float32)
            for _ in input
        ]

    @staticmethod
    def name() -> str:
        """Return the identifier string for this embedding function."""
        return "my_embedding_function"

    @staticmethod
    def build_from_config(config: dict) -> "MyEmbeddingFunction":
        """Build an instance; ``config`` is ignored because there is nothing to configure."""
        return MyEmbeddingFunction()

    def get_config(self) -> dict:
        """Return an empty configuration dictionary."""
        return {}

Encryption

Cmek

Customer-managed encryption key for collection data.
from chromadb import Cmek, Schema

# Create CMEK for GCP
cmek = Cmek.gcp(
    "projects/my-project/locations/us-central1/"
    "keyRings/my-ring/cryptoKeys/my-key"
)

# Use in schema
schema = Schema().set_cmek(cmek)

collection = client.create_collection(
    name="encrypted_collection",
    schema=schema
)
provider
CmekProvider
Cloud provider (currently only CmekProvider.GCP is supported).
resource
str
Provider-specific resource identifier for the encryption key.
Methods:
  • gcp(resource: str): Create a CMEK for Google Cloud Platform
  • validate_pattern(): Validate the resource name format
  • to_dict(): Serialize to dictionary
  • from_dict(data: dict): Deserialize from dictionary

Build docs developers (and LLMs) love