Overview
Cactus provides built-in RAG capabilities with:
Automatic text corpus indexing
Vector similarity search
Embedded vector database
Query-time retrieval
No external vector database required - everything runs on-device.
Quick Start
Automatic RAG Setup
from cactus import cactus_init, cactus_complete, cactus_destroy
import json

# Initialize with a corpus directory; Cactus indexes the .txt files it contains.
model = cactus_init(
    "weights/lfm2-1.2b",
    "path/to/docs",    # Directory containing .txt files
    cache_index=True,  # Cache embeddings for faster startup
)

# The query automatically retrieves relevant context from the indexed corpus.
messages = json.dumps([{
    "role": "user",
    "content": "What is the return policy?",
}])

result = json.loads(cactus_complete(model, messages, None, None, None))
print(result["response"])

# Release the model and its index when done.
cactus_destroy(model)
Cactus automatically:
Chunks documents into passages
Generates embeddings for each passage
Builds a vector index
Retrieves top-k relevant passages at query time
Injects context into the prompt
Corpus Preparation
Organize your documents as text files:
docs/
├── faq.txt
├── product_info.txt
├── policies.txt
└── technical_specs.txt
Each file is automatically chunked and indexed.
Manual RAG Query
Query the RAG index directly:
from cactus import cactus_rag_query
import json

# Query the RAG index directly, without generating a chat response.
# `model` must already be initialized via cactus_init with a corpus directory.
result = json.loads(cactus_rag_query(
    model,
    "machine learning features",
    top_k=5,
))

for doc in result["documents"]:
    print(f"Score: {doc['score']:.3f}")
    print(f"Content: {doc['content'][:200]}...\n")
{
  "success": true,
  "documents": [
    {
      "id": 42,
      "score": 0.87,
      "content": "Machine learning models can be deployed...",
      "metadata": "source: ml_guide.txt"
    },
    {
      "id": 15,
      "score": 0.82,
      "content": "On-device inference provides...",
      "metadata": "source: inference.txt"
    }
  ],
  "query_time_ms": 12.5
}
Custom Vector Index
Use the vector index API for more control:
from cactus import (
    cactus_init,
    cactus_embed,
    cactus_index_init,
    cactus_index_add,
    cactus_index_query,
    cactus_index_destroy,
    cactus_destroy,
)
import json

# Initialize an embedding model (no corpus directory, no cached index).
model = cactus_init("weights/qwen3-embedding-0.6b", None, False)

# Create a vector index on disk; embedding_dim must match the model's output size.
index = cactus_index_init("/path/to/index", embedding_dim=768)

# Add documents: ids, raw text, and one embedding per document.
documents = [
    "Cactus is an AI inference engine for mobile devices.",
    "It supports NPU acceleration on Apple chips.",
    "Quantization reduces model size by 70-90%.",
]
ids = list(range(len(documents)))
embeddings = [cactus_embed(model, doc, True) for doc in documents]
cactus_index_add(index, ids, documents, embeddings, None)

# Query the index with an embedded query string.
query = "How does Cactus optimize for mobile?"
query_emb = cactus_embed(model, query, True)
options = json.dumps({"top_k": 2, "score_threshold": 0.5})
results = json.loads(cactus_index_query(index, query_emb, options))

for result in results["results"]:
    doc_id = result["id"]
    print(f"Score: {result['score']:.3f}")
    print(f"Document: {documents[doc_id]}\n")

# Release the index and the model.
cactus_index_destroy(index)
cactus_destroy(model)
Store metadata with documents:
# Store a JSON metadata string alongside each document.
documents = [
    "Product A costs $99",
    "Product B costs $149",
]
metadatas = [
    json.dumps({"category": "pricing", "product": "A"}),
    json.dumps({"category": "pricing", "product": "B"}),
]
cactus_index_add(index, ids, documents, embeddings, metadatas)

# Retrieve documents (and their metadata) by id.
# NOTE(review): cactus_index_get must be imported from cactus alongside the
# other index functions shown earlier.
result = json.loads(cactus_index_get(index, [0]))
for doc in result["documents"]:
    print(f"Content: {doc['content']}")
    print(f"Metadata: {doc['metadata']}")
Updating the Index
Add New Documents
# Append new documents; ids must not collide with entries already in the index.
new_docs = ["New feature: Cloud fallback"]
new_ids = [100]
new_embeddings = [cactus_embed(model, doc, True) for doc in new_docs]
cactus_index_add(index, new_ids, new_docs, new_embeddings, None)
Delete Documents
# Delete documents by id; the freed space is reclaimed by a later compact.
cactus_index_delete(index, [5, 10, 15])
Compact Index
# Reclaim space left behind by deleted documents; run after bulk deletes.
cactus_index_compact(index)
Query Options
{
  "top_k": 10,
  "score_threshold": 0.7
}
top_k — Maximum number of results to return.
score_threshold — Minimum similarity score (0–1); results scoring below the threshold are filtered out. Set to -1 to disable filtering.
Hybrid Search
Combine semantic and keyword search:
def hybrid_search(query, documents, embeddings, top_k=5):
    """Combine semantic (vector) search with exact keyword matching.

    Returns up to top_k document ids drawn from the union of the semantic
    hits and the case-insensitive substring matches.

    NOTE(review): `embeddings` is accepted but unused (the vector side goes
    through the index), and set() discards ranking, so the [:top_k]
    truncation is arbitrary — sort the combined ids by score if ordering
    matters. `model` and `index` are expected at module level.
    """
    # Semantic search via the vector index.
    query_emb = cactus_embed(model, query, True)
    semantic_results = cactus_index_query(
        index, query_emb, json.dumps({"top_k": top_k})
    )

    # Keyword search: simple case-insensitive substring match per document.
    keyword_results = []
    query_lower = query.lower()
    for i, doc in enumerate(documents):
        if query_lower in doc.lower():
            keyword_results.append(i)

    # Union of both result sets, truncated to top_k.
    combined = set(r["id"] for r in json.loads(semantic_results)["results"])
    combined.update(keyword_results)
    return list(combined)[:top_k]
Reranking
Rerank retrieved documents for better relevance:
def rerank(query, retrieved_docs):
    """Re-score retrieved documents against the query embedding.

    Each element of retrieved_docs is a dict with a "content" key; the list
    is returned reordered, highest dot-product score first. Requires numpy
    (np) and a module-level embedding `model`.

    NOTE(review): np.dot equals cosine similarity only if cactus_embed
    returns normalized vectors — confirm against the embeddings guide.
    """
    query_emb = cactus_embed(model, query, True)

    # Score each document by dot product with the query embedding.
    scores = []
    for doc in retrieved_docs:
        doc_emb = cactus_embed(model, doc["content"], True)
        score = np.dot(query_emb, doc_emb)
        scores.append((score, doc))

    # Sort on the score only: sorting the raw (score, dict) tuples would
    # raise TypeError on tied scores, since dicts are not orderable.
    scores.sort(key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in scores]
Chunking Strategies
Fixed Size
Sentence-Based
Semantic
def chunk_fixed(text, chunk_size=512, overlap=50):
    """Split text into word-based chunks of up to chunk_size words.

    Consecutive chunks share `overlap` words. Returns [] for empty text.

    Raises:
        ValueError: if overlap >= chunk_size — the window could not advance.
            (The original silently returned [] when overlap > chunk_size.)
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), step)
    ]
def chunk_sentences(text, max_sentences=5):
    """Group sentences into chunks of up to max_sentences each.

    Sentences are detected by the literal separator ". " — abbreviations
    ("e.g. ") and other terminal punctuation are not handled.
    """
    sentences = text.split('. ')
    return [
        '. '.join(sentences[i:i + max_sentences])
        for i in range(0, len(sentences), max_sentences)
    ]
def chunk_semantic(text, model):
    """Group paragraphs into chunks of roughly 500 words.

    Paragraphs (blank-line separated) are accumulated until the running
    chunk exceeds 500 words; the chunk is then emitted and accumulation
    restarts. Any remainder becomes the final chunk.

    NOTE(review): `model` is accepted but never used — presumably reserved
    for embedding-based boundary detection; kept for interface compatibility.
    """
    paragraphs = text.split('\n\n')
    chunks = []
    current_chunk = []
    for para in paragraphs:
        current_chunk.append(para)
        chunk_text = '\n\n'.join(current_chunk)
        if len(chunk_text.split()) > 500:
            chunks.append(chunk_text)
            current_chunk = []
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))
    return chunks
Use cache_index=True to avoid recomputing embeddings on startup
Chunk documents to 256-512 tokens for optimal retrieval
Use batch embedding generation for large corpora
Enable NPU acceleration for faster embedding generation
The index is stored in two files:
index.bin - Vector embeddings (FP16) and metadata
data.bin - Document content and metadata strings
Both use memory-mapped I/O for efficient access.
Error Handling
try:
    result = json.loads(cactus_rag_query(model, query, top_k=5))
    if not result["success"]:
        # The call completed but the query itself failed; details in "error".
        print(f"RAG query failed: {result.get('error')}")
except RuntimeError as e:
    # NOTE(review): cactus is assumed to raise RuntimeError on native-engine
    # failures — confirm against the API reference.
    print(f"Error: {e}")
Next Steps
Embeddings Guide Learn about embedding generation
Vector Index API Complete vector database API reference
Chat Completion Use RAG with chat completion
Supported Models Browse embedding models