Documentation Index: Fetch the complete documentation index at https://mintlify.com/BerriAI/litellm/llms.txt
Use this file to discover all available pages before exploring further.
Overview
Embeddings convert text into numerical vectors for similarity search, retrieval, and RAG applications. LiteLLM provides a unified interface for embeddings across OpenAI, Cohere, HuggingFace, and more.
Quick Start
from litellm import embedding
response = embedding(
model = "text-embedding-3-small" ,
input = [ "Text to embed" ]
)
vector = response.data[ 0 ].embedding
print ( f "Vector length: { len (vector) } " )
Basic Usage
Single Text
Multiple Texts
from litellm import embedding
response = embedding(
model = "text-embedding-3-small" ,
input = "The quick brown fox jumps over the lazy dog"
)
vector = response.data[ 0 ].embedding
from litellm import embedding
texts = [
"First document" ,
"Second document" ,
"Third document"
]
response = embedding(
model = "text-embedding-3-small" ,
input = texts
)
# Get all embeddings
vectors = [data.embedding for data in response.data]
Providers
OpenAI
Cohere
HuggingFace
Ollama
Azure OpenAI
The latest embedding models, offering high quality.
from litellm import embedding
# text-embedding-3-small - Fast and efficient
response = embedding(
model = "text-embedding-3-small" ,
input = [ "Text to embed" ]
)
# text-embedding-3-large - Higher quality
response = embedding(
model = "text-embedding-3-large" ,
input = [ "Text to embed" ]
)
# ada-002 - Previous generation
response = embedding(
model = "text-embedding-ada-002" ,
input = [ "Text to embed" ]
)
Specialized embeddings for search and classification.
from litellm import embedding
# For search queries
response = embedding(
model = "cohere/embed-english-v3.0" ,
input = [ "search query" ],
input_type = "search_query"
)
# For documents
response = embedding(
model = "cohere/embed-english-v3.0" ,
input = [ "document content" ],
input_type = "search_document"
)
# Multilingual
response = embedding(
model = "cohere/embed-multilingual-v3.0" ,
input = [ "texto en español" ]
)
Open-source embedding models.
from litellm import embedding
# Sentence Transformers
response = embedding(
model = "huggingface/sentence-transformers/all-MiniLM-L6-v2" ,
input = [ "Text to embed" ]
)
# BGE models
response = embedding(
model = "huggingface/BAAI/bge-large-en-v1.5" ,
input = [ "Text to embed" ]
)
Local embedding models.
from litellm import embedding
response = embedding(
model = "ollama/nomic-embed-text" ,
input = [ "Text to embed" ],
api_base = "http://localhost:11434"
)
from litellm import embedding
import os
response = embedding(
model = "azure/text-embedding-ada-002" ,
input = [ "Text to embed" ],
api_key = os.environ[ "AZURE_API_KEY" ],
api_base = os.environ[ "AZURE_API_BASE" ],
api_version = "2023-07-01-preview"
)
Dimensions Control
Some providers allow controlling output dimensions.
from litellm import embedding
# OpenAI - Reduce dimensions for storage efficiency
response = embedding(
model = "text-embedding-3-large" ,
input = [ "Text to embed" ],
dimensions = 256 # Default is 3072 for 3-large
)
# Cohere - Control output dimension
response = embedding(
model = "cohere/embed-english-v3.0" ,
input = [ "Text to embed" ],
dimensions = 384
)
from litellm import embedding
response = embedding(
model = "text-embedding-3-small" ,
input = [ "Text to embed" ],
encoding_format = "float" # or "base64"
)
# Float format (default)
vector = response.data[ 0 ].embedding # List of floats
# Base64 format - more compact for transmission
response = embedding(
model = "text-embedding-3-small" ,
input = [ "Text to embed" ],
encoding_format = "base64"
)
Batch Processing
Process large datasets efficiently.
from litellm import embedding
from typing import List
def embed_in_batches(
    texts: List[str],
    batch_size: int = 100,
    model: str = "text-embedding-3-small",
):
    """Embed ``texts`` in consecutive slices of at most ``batch_size`` items.

    One ``embedding`` request is issued per slice and the resulting vectors
    are concatenated in input order.

    Args:
        texts: Strings to embed. An empty list issues no requests.
        batch_size: Maximum number of texts sent per request; must be >= 1.
        model: Model name forwarded to ``embedding``.

    Returns:
        A list with one embedding per input text, in the same order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step would make range() empty and silently drop every text,
    # so reject non-positive sizes up front.
    if batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        response = embedding(
            model=model,
            input=texts[start:start + batch_size],
        )
        all_embeddings.extend(item.embedding for item in response.data)
    return all_embeddings
# Process 1000 documents
texts = [ f "Document { i } " for i in range ( 1000 )]
embeddings = embed_in_batches(texts)
Similarity Search
from litellm import embedding
import numpy as np
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: First vector (any sequence accepted by ``np.dot``).
        b: Second vector, same length as ``a``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero vector has no direction; dividing would yield NaN, which then
    # leaks into any ranking built on these scores.
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator
# Embed documents
documents = [
"Python is a programming language" ,
"JavaScript is used for web development" ,
"Machine learning uses neural networks"
]
response = embedding(
model = "text-embedding-3-small" ,
input = documents
)
doc_embeddings = [data.embedding for data in response.data]
# Embed query
query = "What is Python?"
query_response = embedding(
model = "text-embedding-3-small" ,
input = [query]
)
query_embedding = query_response.data[ 0 ].embedding
# Find most similar
similarities = [
cosine_similarity(query_embedding, doc_emb)
for doc_emb in doc_embeddings
]
best_idx = np.argmax(similarities)
print ( f "Most similar: { documents[best_idx] } " )
print ( f "Similarity: { similarities[best_idx] :.4f} " )
RAG (Retrieval Augmented Generation)
from litellm import embedding, completion
import numpy as np
# 1. Embed knowledge base
knowledge_base = [
"LiteLLM is a unified interface for LLMs" ,
"LiteLLM supports 100+ providers" ,
"LiteLLM handles automatic retries and fallbacks"
]
kb_response = embedding(
model = "text-embedding-3-small" ,
input = knowledge_base
)
kb_embeddings = [data.embedding for data in kb_response.data]
# 2. Embed user query
query = "What does LiteLLM do?"
query_response = embedding(
model = "text-embedding-3-small" ,
input = [query]
)
query_embedding = query_response.data[ 0 ].embedding
# 3. Find relevant documents
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: First vector (any sequence accepted by ``np.dot``).
        b: Second vector, same length as ``a``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the division: a zero-norm vector would otherwise produce NaN
    # and distort the sort used to pick the top matches.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product
similarities = [
cosine_similarity(query_embedding, kb_emb)
for kb_emb in kb_embeddings
]
top_k = 2
top_indices = np.argsort(similarities)[ - top_k:][:: - 1 ]
relevant_docs = [knowledge_base[i] for i in top_indices]
# 4. Generate answer with context
context = " \n " .join(relevant_docs)
response = completion(
model = "gpt-4o-mini" ,
messages = [{
"role" : "user" ,
"content" : f "Context: \n { context } \n\n Question: { query } "
}]
)
print (response.choices[ 0 ].message.content)
Async Embeddings
import asyncio
from litellm import aembedding
async def embed_async():
    """Await one ``aembedding`` request and return the first vector.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    result = await aembedding(
        model="text-embedding-3-small",
        input=["Text to embed"],
    )
    first_item = result.data[0]
    return first_item.embedding
vector = asyncio.run(embed_async())
Parallel Processing
import asyncio
from litellm import aembedding
async def embed_multiple_models(text):
    """Embed ``text`` with three models concurrently.

    Args:
        text: The string to embed; it is sent to each model as a
            single-item input list.

    Returns:
        A dict with keys ``"small"``, ``"large"`` and ``"cohere"``, each
        mapped to the first embedding of the corresponding response.
    """
    # Insertion order fixes both the request order and the result keys.
    model_by_label = {
        "small": "text-embedding-3-small",
        "large": "text-embedding-3-large",
        "cohere": "cohere/embed-english-v3.0",
    }
    pending = [
        aembedding(model=model_name, input=[text])
        for model_name in model_by_label.values()
    ]
    # gather() returns results in the same order the awaitables were passed.
    results = await asyncio.gather(*pending)
    return {
        label: result.data[0].embedding
        for label, result in zip(model_by_label, results)
    }
results = asyncio.run(embed_multiple_models( "Compare embeddings" ))
Caching
Cache embeddings to reduce API calls.
from litellm import embedding
import litellm
import hashlib
# Enable caching
litellm.cache = litellm.Cache()
def get_cached_embedding(text: str, model: str):
    """Embed ``text`` with ``model``, requesting caching for the call.

    Args:
        text: The string to embed; sent as a single-item input list.
        model: Model name forwarded to ``embedding``.

    Returns:
        The first embedding in the response.
    """
    # caching=True asks LiteLLM to serve repeated identical requests from
    # its cache rather than calling the provider again.
    request = {
        "model": model,
        "input": [text],
        "caching": True,
    }
    return embedding(**request).data[0].embedding
# First call - API request
vec1 = get_cached_embedding( "Hello world" , "text-embedding-3-small" )
# Second call - cached
vec2 = get_cached_embedding( "Hello world" , "text-embedding-3-small" )
Usage Tracking
from litellm import embedding
response = embedding(
    model="text-embedding-3-small",
    input=["Text 1", "Text 2", "Text 3"],
)

# Token usage
print(f"Tokens used: {response.usage.total_tokens}")
print(f"Prompt tokens: {response.usage.prompt_tokens}")

# Cost (if available)
# NOTE(review): _hidden_params is read defensively because its presence and
# shape are not shown here; confirm against the LiteLLM response object.
hidden_params = getattr(response, "_hidden_params", None) or {}
cost = hidden_params.get("response_cost")
# Compare against None so a legitimate cost of 0 is still reported.
if cost is not None:
    print(f"Cost: ${cost}")
Error Handling
from litellm import embedding
from litellm.exceptions import (
    APIError,
    ContextWindowExceededError,
    RateLimitError,
)

try:
    response = embedding(
        model="text-embedding-3-small",
        input=["Very long text..." * 10000],  # May exceed token limit
    )
except ContextWindowExceededError as e:
    # NOTE(review): over-length input is expected to surface as a
    # bad-request style error that the APIError handler below would not
    # catch; confirm against the LiteLLM exception mapping.
    print(f"Input exceeds the model's token limit: {e}")
except RateLimitError as e:
    print(f"Rate limit exceeded: {e}")
except APIError as e:
    print(f"API error: {e.status_code} - {e.message}")
Model Comparison
| Model | Provider | Dimensions | Max Tokens | Use Case |
|---|---|---|---|---|
| text-embedding-3-small | OpenAI | 1536 | 8191 | General purpose, fast |
| text-embedding-3-large | OpenAI | 3072 | 8191 | High quality |
| embed-english-v3.0 | Cohere | 1024 | - | Search, classification |
| all-MiniLM-L6-v2 | HuggingFace | 384 | 256 | Fast, local |
| bge-large-en-v1.5 | HuggingFace | 1024 | 512 | High quality |
| nomic-embed-text | Ollama | 768 | - | Local, privacy |
Best Practices
Use text-embedding-3-small for most use cases
Use text-embedding-3-large for highest quality
Use Cohere for specialized search applications
Use Ollama for privacy-sensitive applications
Use smaller models when quality difference is minimal
Reduce dimensions to save storage and compute
Cache embeddings to avoid re-computing
Batch process to reduce API overhead
Normalize text before embedding
Keep consistent text format across corpus
Use same model for queries and documents
Test multiple models for your specific use case
Advanced Patterns
Hybrid Search
Embedding Store
Combine embeddings with keyword search.
from litellm import embedding
import numpy as np
def hybrid_search(query, documents, alpha=0.5):
    """Return the index of the document that best matches ``query``.

    The score of each document is ``alpha * semantic + (1 - alpha) * keyword``.
    Both components are kept on comparable scales so that ``alpha`` really
    controls the balance between them:

    * semantic: cosine similarity between query and document embeddings;
    * keyword: fraction of whitespace-separated query terms that occur in
      the document (case-insensitive substring match).

    Args:
        query: Search string.
        documents: Non-empty list of document strings.
        alpha: Weight of the semantic score; ``1 - alpha`` weights keywords.

    Returns:
        The position in ``documents`` of the highest combined score, as
        returned by ``np.argmax``.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    # Fail before spending API calls; argmax of an empty list would raise
    # ValueError at the very end anyway.
    if not documents:
        raise ValueError("documents must not be empty")

    # Semantic search
    query_emb = np.asarray(
        embedding(
            model="text-embedding-3-small",
            input=[query],
        ).data[0].embedding
    )
    doc_embs = embedding(
        model="text-embedding-3-small",
        input=documents,
    )
    query_norm = np.linalg.norm(query_emb)
    semantic_scores = []
    for d in doc_embs.data:
        doc_emb = np.asarray(d.embedding)
        denominator = query_norm * np.linalg.norm(doc_emb)
        # Cosine rather than a raw dot product, so vector magnitude cannot
        # dominate; a zero vector scores 0 instead of NaN.
        semantic_scores.append(
            np.dot(query_emb, doc_emb) / denominator if denominator else 0.0
        )

    # Keyword search (simple): tokenize the query once, not per document.
    query_terms = query.lower().split()
    keyword_scores = []
    for doc in documents:
        doc_lower = doc.lower()
        hits = sum(term in doc_lower for term in query_terms)
        # Normalise to [0, 1]; raw hit counts would swamp the cosine term.
        keyword_scores.append(hits / len(query_terms) if query_terms else 0.0)

    # Combine scores
    combined = [
        alpha * sem + (1 - alpha) * kw
        for sem, kw in zip(semantic_scores, keyword_scores)
    ]
    return np.argmax(combined)
A simple vector store implementation.
from litellm import embedding
import numpy as np
from typing import List, Tuple
class EmbeddingStore:
    """In-memory store pairing texts with their embeddings.

    Texts and vectors are kept in two parallel lists; ``search`` ranks the
    stored vectors by cosine similarity to the embedded query.
    """

    def __init__(self, model="text-embedding-3-small"):
        """Create an empty store that embeds with ``model``."""
        self.model = model
        self.texts = []       # stored strings, in insertion order
        self.embeddings = []  # embeddings[i] belongs to texts[i]

    def add(self, texts: List[str]):
        """Embed ``texts`` in one request and append them to the store.

        An empty list is a no-op and issues no request.
        """
        if not texts:
            return
        response = embedding(
            model=self.model,
            input=texts,
        )
        new_embeddings = [d.embedding for d in response.data]
        self.texts.extend(texts)
        self.embeddings.extend(new_embeddings)

    def search(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
        """Return up to ``top_k`` stored texts most similar to ``query``.

        Args:
            query: Search string; embedded with the store's model.
            top_k: Maximum number of results. Values <= 0 yield ``[]``.

        Returns:
            ``(text, similarity)`` pairs ordered from most to least similar.
        """
        # [-0:] slices the whole array, so top_k=0 would return everything;
        # handle it, negatives and the empty store without an API call.
        if top_k <= 0 or not self.embeddings:
            return []

        query_emb = embedding(
            model=self.model,
            input=[query],
        ).data[0].embedding
        # The query norm is the same for every comparison; compute it once.
        query_norm = np.linalg.norm(query_emb)
        similarities = [
            np.dot(query_emb, emb) / (query_norm * np.linalg.norm(emb))
            for emb in self.embeddings
        ]
        # argsort is ascending: take the last top_k and reverse them.
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [(self.texts[i], similarities[i]) for i in top_indices]