Documentation Index
Fetch the complete documentation index at: https://mintlify.com/primeintellect-ai/verifiers/llms.txt
Use this file to discover all available pages before exploring further.
This example demonstrates building a tool-based environment where models search Wikipedia to answer trivia questions. It showcases custom tool implementation, vector search with embeddings, and LLM-as-judge evaluation.
Overview
The Wiki Search environment provides:
- Tools: Three custom Wikipedia search tools
- Search: Semantic search using embeddings (ChromaDB + OpenAI)
- Dataset: Wikipedia trivia questions
- Evaluation: LLM judge for flexible answer matching
- Format: Multi-turn tool use
Complete Implementation
Here’s the full working implementation from environments/wiki_search/wiki_search.py:
Main (Part 1)
ChromaDB Setup
Tools (Part 1)
Tools (Part 2)
Tools (Part 3)
Judge & Environment
import asyncio
import os
from typing import cast
import chromadb
from chromadb.api.types import Embeddable, EmbeddingFunction
from chromadb.utils import embedding_functions
from datasets import load_dataset
from openai import AsyncOpenAI
import verifiers as vf
from verifiers.rubrics.judge_rubric import JudgeRubric
CHROMA_DB_DIR = ".chroma_db"
_chroma_semaphore: asyncio.Semaphore | None = None
def _get_chroma_semaphore() -> asyncio.Semaphore:
    """Return the shared ChromaDB semaphore, creating it on first use.

    The semaphore is stored in the module-level ``_chroma_semaphore`` so every
    caller in this module shares the same limit of 100 concurrent holders.

    returns:
        asyncio.Semaphore: The module-wide semaphore instance.
    """
    global _chroma_semaphore
    # Created lazily rather than at import time; later calls reuse the instance.
    if _chroma_semaphore is None:
        _chroma_semaphore = asyncio.Semaphore(100)
    return _chroma_semaphore
def load_environment(
    max_turns: int = 10,
    judge_model: str = "gpt-4.1-mini",
    judge_base_url: str = "https://api.openai.com/v1",
    judge_api_key_var: str = "OPENAI_API_KEY",
    embed_model: str = "text-embedding-3-small",
    embed_base_url: str = "https://api.openai.com/v1",
    embed_api_key_var: str = "OPENAI_API_KEY",
    corpus_dataset: str = "willcb/rare-wiki-pages",
    corpus_split: str = "train",
    chroma_db_dir: str = CHROMA_DB_DIR,
) -> vf.Environment:
    """Build the Wiki Search tool environment.

    Loads the wiki corpus into memory, defines three tools (``search_pages``,
    ``view_sections``, ``read_section``) over it, and wires them into a
    ``vf.ToolEnv`` scored by an LLM judge.

    args:
        max_turns (int): Passed to ``vf.ToolEnv`` as ``max_turns``.
        judge_model (str): Model name used by the judge rubric.
        judge_base_url (str): Base URL for the judge client.
        judge_api_key_var (str): Name of the env var holding the judge API key.
        embed_model (str): Embedding model used for the title index.
        embed_base_url (str): Base URL for the embedding API.
        embed_api_key_var (str): Name of the env var holding the embedding API
            key; falls back to the literal "EMPTY" when unset.
        corpus_dataset (str): Dataset providing rows with "id", "title" and
            "content" fields.
        corpus_split (str): Split of ``corpus_dataset`` to load.
        chroma_db_dir (str): Directory for the persistent ChromaDB client.

    returns:
        vf.Environment: The configured tool environment.

    raises:
        KeyError: If the env var named by ``judge_api_key_var`` is not set.
    """
    # Load corpus into memory
    corpus = load_dataset(corpus_dataset, split=corpus_split)
    page_id_to_title: dict[str, str] = {}
    page_id_to_content: dict[str, str] = {}
    for row in corpus:
        row = cast(dict, row)
        pid = row["id"]
        page_id_to_title[pid] = row["title"]
        page_id_to_content[pid] = row["content"]

    # Lazy chroma initialization. The cache lives in this closure, so the
    # collection is opened and indexed at most once per load_environment() call.
    _chroma_state: dict = {"collection": None}

    def _get_collection():
        """Return the title collection, opening and indexing it on first use."""
        if _chroma_state["collection"] is None:
            openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                model_name=embed_model,
                api_base=embed_base_url,
                api_key=os.getenv(embed_api_key_var, "EMPTY"),
            )
            client = chromadb.PersistentClient(path=chroma_db_dir)
            collection = client.get_or_create_collection(
                name="wiki_titles",
                embedding_function=cast(EmbeddingFunction[Embeddable], openai_ef),
            )
            _init_chroma(collection)
            # Cache only after indexing succeeded, so a failed first attempt
            # is retried on the next call instead of leaving a partial index.
            _chroma_state["collection"] = collection
        return _chroma_state["collection"]

    def _init_chroma(collection) -> None:
        """Upsert every corpus page whose id is not yet in ``collection``."""
        all_ids = list(page_id_to_title.keys())

        # Look up existing ids in batches of 500.
        existing: set[str] = set()
        for i in range(0, len(all_ids), 500):
            batch = all_ids[i : i + 500]
            got = collection.get(ids=batch)
            existing.update(got.get("ids", []))

        missing = [pid for pid in all_ids if pid not in existing]
        if not missing:
            return

        documents = []
        metadatas = []
        for pid in missing:
            title = str(page_id_to_title[pid]).strip()
            if not title:
                raise ValueError(f"Empty title for page_id {pid}")
            documents.append(title)
            metadatas.append({"title": title})

        bs = 100
        for i in range(0, len(missing), bs):
            collection.upsert(
                ids=missing[i : i + bs],
                documents=documents[i : i + bs],
                metadatas=metadatas[i : i + bs],
            )

    # Helper for normalizing section IDs
    def normalize_id(text: str) -> str:
        """Normalize free text into an id: lowercased with spaces as underscores."""
        return text.strip().lower().replace(" ", "_")

    # Tool 1: Search pages by title
    async def search_pages(query: str) -> list[dict]:
        """Search for top 10 relevant articles using title embedding similarity.

        args:
            query (str): The query to search for.

        returns:
            list[dict]: A list of dicts with page_id and title.

        example:
            "basketball" -> [{"page_id": "basketball", "title": "Basketball"}, ...]
        """
        # NOTE: this call is synchronous; the first invocation also runs the
        # index initialization before the query is dispatched to a thread.
        collection = _get_collection()
        async with _get_chroma_semaphore():
            results = await asyncio.to_thread(
                collection.query, query_texts=[query], n_results=10
            )
        if not results or not results["metadatas"]:
            raise ValueError(f"No results found for query: {query}")
        # A single query text was sent, so only the first result row is read.
        return [
            {"page_id": page_id, "title": metadata["title"]}
            for page_id, metadata in zip(results["ids"][0], results["metadatas"][0])
        ]

    # Tool 2: View sections of a page
    async def view_sections(page_id: str) -> list[dict]:
        """View the sections of a page.

        args:
            page_id (str): The ID of the page to view.

        returns:
            list[dict]: A list of dicts with section_id and section_name.

        example:
            "basketball" -> [{"section_id": "basketball:history", "section_name": "History"}, ...]
        """
        content = page_id_to_content[page_id]
        sections = []
        # Every line starting with "#" is treated as a section header.
        for line in content.split("\n"):
            if line.startswith("#"):
                section_name = line.lstrip("#").strip()
                sections.append({
                    "section_id": f"{page_id}:{normalize_id(section_name)}",
                    "section_name": section_name,
                })
        # If no sections, return whole page
        if not sections:
            sections.append({
                "section_id": f"{page_id}:full",
                "section_name": "Full Page",
            })
        return sections

    # Tool 3: Read a section
    async def read_section(section_id: str) -> str:
        """Read a section of a page.

        args:
            section_id (str): The ID of the section to read.

        returns:
            str: The content of the section.

        example:
            "baseball:finnish_baseball" -> "Finnish baseball is a sport..."
        """
        if ":" not in section_id:
            raise ValueError(
                "Invalid section_id format. Expected: page_id:section_name"
            )
        page_id, section_name_id = section_id.split(":", 1)
        content = page_id_to_content[page_id]

        # Special case for "full" section
        if section_name_id == "full":
            return content

        # Find section boundaries: start at the first header whose normalized
        # name matches, end at the next header of any level (or end of page).
        lines = content.split("\n")
        section_start = None
        section_end = len(lines)
        for i, line in enumerate(lines):
            if not line.startswith("#"):
                continue
            if section_start is None:
                if normalize_id(line.lstrip("#").strip()) == section_name_id:
                    section_start = i
            else:
                section_end = i
                break

        if section_start is None:
            raise ValueError(f"Section not found: {section_id}")
        return "\n".join(lines[section_start:section_end])

    tools = [search_pages, view_sections, read_section]

    # Load trivia dataset
    parser = vf.Parser()
    dataset = load_dataset("willcb/wiki-trivia-questions-v4", split="train")

    # Setup judge rubric. The {question}, {answer} and {response} placeholders
    # are what give the judge the material it is asked to compare.
    JUDGE_PROMPT = """Given a ground truth answer \
and a response, determine if the response is both correct and coherent.

Question:
{question}

Ground truth answer:
{answer}

Response:
{response}

Respond either "yes" or "no" only.

If a response contains incoherent text, respond with "no" even if the correct answer is also present.
"""
    judge_client = AsyncOpenAI(
        base_url=judge_base_url, api_key=os.environ[judge_api_key_var]
    )
    judge_rubric = JudgeRubric(
        judge_client=judge_client,
        judge_model=judge_model,
        parser=parser,
        judge_prompt=JUDGE_PROMPT,
    )

    async def judge_reward_func(judge, prompt, completion, answer, state) -> float:
        """Return 1.0 when the judge's reply contains "yes" (any case), else 0.0."""
        judge_response = await judge(prompt, completion, answer, state)
        return 1.0 if "yes" in judge_response.lower() else 0.0

    system_prompt = "Use the provided Wikipedia search tools to help answer questions."
    judge_rubric.add_reward_func(judge_reward_func, weight=1.0)
    vf_env = vf.ToolEnv(
        dataset=dataset,
        system_prompt=system_prompt,
        parser=parser,
        rubric=judge_rubric,
        tools=tools,
        max_turns=max_turns,
    )
    return vf_env
How It Works
1. Custom Wikipedia Tools
Three tools provide Wikipedia access:
search_pages(query) - Semantic search for relevant articles
- Uses embedding similarity (text-embedding-3-small)
- Returns top 10 matching pages with IDs and titles
view_sections(page_id) - List sections of an article
- Parses markdown headers to extract sections
- Returns section IDs and names
read_section(section_id) - Read section content
- Extracts text between markdown headers
- Returns full content of the section
2. Vector Search with ChromaDB
Embedding-based search enables semantic matching:
# Initialize ChromaDB with OpenAI embeddings
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
model_name="text-embedding-3-small",
api_base="https://api.openai.com/v1",
api_key=os.getenv("OPENAI_API_KEY"),
)
client = chromadb.PersistentClient(path=".chroma_db")
collection = client.get_or_create_collection(
name="wiki_titles",
embedding_function=openai_ef,
)
Features:
- Persistent storage (.chroma_db directory)
- Lazy initialization (loads on first search)
- Automatic upsert of missing pages
- Concurrent access via semaphore (max 100 parallel)
3. LLM Judge Evaluation
JudgeRubric uses an LLM to evaluate answers:
judge_rubric = JudgeRubric(
judge_client=judge_client,
judge_model="gpt-4.1-mini",
judge_prompt=JUDGE_PROMPT,
)
async def judge_reward_func(judge, prompt, completion, answer, state) -> float:
    """Score a rollout from the judge's verdict.

    Awaits ``judge(prompt, completion, answer, state)`` and returns 1.0 when
    the reply contains "yes" (case-insensitive), otherwise 0.0.
    """
    judge_response = await judge(prompt, completion, answer, state)
    return 1.0 if "yes" in judge_response.lower() else 0.0
Why judge evaluation?
- Trivia answers have multiple valid phrasings
- Example: “Barack Obama” = “Obama” = “Barack H. Obama”
- Judge can handle paraphrasing and context
4. ToolEnv Integration
ToolEnv provides tool calling capabilities:
vf_env = vf.ToolEnv(
dataset=dataset,
tools=[search_pages, view_sections, read_section],
max_turns=10,
rubric=judge_rubric,
)
The environment:
- Extracts tool schemas from function signatures
- Executes tool calls and returns results
- Tracks tool usage metrics
Example Interaction
Full Trajectory
Dataset Sample
Tool Usage Pattern
Question: Who painted the Mona Lisa?
Assistant: I’ll search for information about the Mona Lisa painting.
Tool Call: search_pages("Mona Lisa painting")
Tool Result:
[
{"page_id": "mona_lisa", "title": "Mona Lisa"},
{"page_id": "leonardo_da_vinci", "title": "Leonardo da Vinci"},
{"page_id": "louvre", "title": "Louvre Museum"},
...
]
Assistant: Let me view the sections of the Mona Lisa article.
Tool Call: view_sections("mona_lisa")
Tool Result:
[
{"section_id": "mona_lisa:description", "section_name": "Description"},
{"section_id": "mona_lisa:history", "section_name": "History"},
{"section_id": "mona_lisa:artist", "section_name": "Artist"},
...
]
Assistant: I’ll read the artist section.
Tool Call: read_section("mona_lisa:artist")
Tool Result:
# Artist
The Mona Lisa was painted by Leonardo da Vinci in the early 16th century,
between 1503 and 1519. It is considered one of his most famous works...
Assistant: Based on the Wikipedia article, Leonardo da Vinci painted the Mona Lisa.
Judge Evaluation: “yes”
Result: ✓ Correct (reward = 1.0)
{
"question": "Who painted the Mona Lisa?",
"answer": "Leonardo da Vinci"
}
{
"question": "What is the capital of France?",
"answer": "Paris"
}
{
"question": "Who wrote '1984'?",
"answer": "George Orwell"
}
Typical search flow:
1. Search for relevant pages: search_pages("topic keywords")
2. View sections to find relevant content: view_sections(page_id)
3. Read specific sections: read_section(section_id)
4. Answer based on retrieved information
Efficient agents minimize tool calls by:
- Using precise search queries
- Reading only relevant sections
- Stopping once answer is found
Running the Environment
Installation
# Install with ChromaDB dependency
prime env install wiki-search
Quick Evaluation
# Evaluate on 20 questions
prime eval run wiki-search \
-m openai/gpt-4.1-mini \
-b https://api.openai.com/v1 \
-k OPENAI_API_KEY \
-n 20 \
-r 5
Custom Configuration
# Use different embedding and judge models
prime eval run wiki-search \
-m openai/gpt-4.1-mini \
-a '{
"max_turns": 15,
"judge_model": "openai/gpt-4o-mini",
"embed_model": "text-embedding-3-large",
"corpus_dataset": "willcb/rare-wiki-pages"
}' \
-n 50 \
-r 4
Configuration Options
| Parameter | Default | Description |
|---|---|---|
| max_turns | 10 | Maximum tool interactions |
| judge_model | "gpt-4.1-mini" | Model for evaluating answers |
| judge_base_url | OpenAI API | Judge model endpoint |
| judge_api_key_var | "OPENAI_API_KEY" | Env var for judge API key |
| embed_model | "text-embedding-3-small" | Embedding model for search |
| embed_base_url | OpenAI API | Embedding endpoint |
| embed_api_key_var | "OPENAI_API_KEY" | Env var for embedding API key |
| corpus_dataset | "willcb/rare-wiki-pages" | Wikipedia corpus |
| corpus_split | "train" | Split of the corpus dataset to load |
| chroma_db_dir | ".chroma_db" | ChromaDB storage path |
Key Features
Verifiers automatically extracts tool schemas from Python functions:
async def search_pages(query: str) -> list[dict]:
    """Search for top 10 relevant articles using title embedding similarity.

    args:
        query (str): The query to search for.

    returns:
        list[dict]: A list of dicts with page_id and title.
    """
Generated schema:
{
"name": "search_pages",
"description": "Search for top 10 relevant articles using title embedding similarity.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The query to search for."
}
},
"required": ["query"]
}
}
Concurrent Search with Semaphores
Manage concurrent ChromaDB access:
_chroma_semaphore = asyncio.Semaphore(100) # Max 100 concurrent queries
async with _chroma_semaphore:
results = await asyncio.to_thread(
collection.query, query_texts=[query], n_results=10
)
Prevents overwhelming the embedding API or ChromaDB.
Persistent Vector Storage
ChromaDB persists embeddings to disk:
- First run: Embeds all Wikipedia titles (~slow)
- Subsequent runs: Loads from .chroma_db/ (~fast)
- Automatic incremental updates for new pages
Metrics Tracked
judge_reward: 1.0 if judge says “yes”, 0.0 otherwise
num_turns: Number of tool interactions
total_tool_calls: Total tools called
search_pages_calls: Number of search operations
view_sections_calls: Number of section list operations
read_section_calls: Number of section reads
Advanced Usage
Custom Wikipedia Corpus
Use your own Wikipedia subset:
env = load_environment(
corpus_dataset="your-username/custom-wiki-corpus",
corpus_split="train",
)
Required format:
{
"id": "page_id",
"title": "Page Title",
"content": "# Section 1\n\nContent...\n\n# Section 2\n\n..."
}
Different Embedding Models
Use alternative embedding models:
# OpenAI alternatives
env = load_environment(
embed_model="text-embedding-3-large", # Higher quality
)
# Or use sentence-transformers (requires custom setup)
# See ChromaDB docs for details
Custom Judge Prompts
Modify evaluation criteria:
CUSTOM_JUDGE_PROMPT = """Does the response contain the correct answer?
Question: {question}
Expected: {answer}
Response: {response}
Answer 'yes' only if the response contains factually correct information
matching the expected answer. Ignore formatting differences.
"""
judge_rubric = JudgeRubric(
judge_model="gpt-4.1-mini",
judge_prompt=CUSTOM_JUDGE_PROMPT,
)
Reward efficient tool usage:
async def efficiency_reward(state) -> float:
    """Reward finding answer with fewer tool calls.

    Reads ``total_tool_calls`` from ``state`` (treated as 0 when absent) and
    returns 0.5 when it is at most 3, otherwise 0.0.
    """
    tool_calls = state.get("total_tool_calls", 0)
    if tool_calls <= 3:
        return 0.5  # Bonus for quick answers
    return 0.0
judge_rubric.add_reward_func(efficiency_reward, weight=1.0)
Next Steps