Wikipedia Search Tool

This example demonstrates building a tool-based environment where models search Wikipedia to answer trivia questions. It showcases custom tool implementation, vector search with embeddings, and LLM-as-judge evaluation.

Overview

The Wiki Search environment provides:

Tools: Three custom Wikipedia search tools
Search: Semantic search using embeddings (ChromaDB + OpenAI)
Dataset: Wikipedia trivia questions
Evaluation: LLM judge for flexible answer matching
Format: Multi-turn tool use

Complete Implementation

Here’s the full working implementation from environments/wiki_search/wiki_search.py:

import asyncio
import os
from typing import cast

import chromadb
from chromadb.api.types import Embeddable, EmbeddingFunction
from chromadb.utils import embedding_functions
from datasets import load_dataset
from openai import AsyncOpenAI

import verifiers as vf
from verifiers.rubrics.judge_rubric import JudgeRubric

CHROMA_DB_DIR = ".chroma_db"
_chroma_semaphore: asyncio.Semaphore | None = None

def _get_chroma_semaphore() -> asyncio.Semaphore:
    """Return the module-wide Chroma semaphore, creating it on first use.

    The semaphore allows up to 100 holders at once. It is built lazily rather
    than at import time (presumably so it is created inside the running event
    loop - TODO confirm).
    """
    global _chroma_semaphore
    semaphore = _chroma_semaphore
    if semaphore is None:
        semaphore = _chroma_semaphore = asyncio.Semaphore(100)
    return semaphore

def load_environment(
    max_turns: int = 10,
    judge_model: str = "gpt-4.1-mini",
    judge_base_url: str = "https://api.openai.com/v1",
    judge_api_key_var: str = "OPENAI_API_KEY",
    embed_model: str = "text-embedding-3-small",
    embed_base_url: str = "https://api.openai.com/v1",
    embed_api_key_var: str = "OPENAI_API_KEY",
    corpus_dataset: str = "willcb/rare-wiki-pages",
    corpus_split: str = "train",
    chroma_db_dir: str = CHROMA_DB_DIR,
) -> vf.Environment:
    """Build the Wikipedia-search tool environment.

    Loads the corpus into in-memory title/content maps, defines the
    ``search_pages``, ``view_sections`` and ``read_section`` tools over it,
    and returns a ``vf.ToolEnv`` whose answers are scored by a ``JudgeRubric``.

    Args:
        max_turns: Passed to ``vf.ToolEnv`` as ``max_turns``.
        judge_model: Model name handed to ``JudgeRubric``.
        judge_base_url: Base URL of the ``AsyncOpenAI`` client used by the judge.
        judge_api_key_var: Name of the environment variable holding the judge
            API key; it is read with ``os.environ[...]``.
        embed_model: Embedding model name for the Chroma OpenAI embedding function.
        embed_base_url: API base URL for the embedding function.
        embed_api_key_var: Name of the environment variable holding the
            embedding API key; ``"EMPTY"`` is used when it is unset.
        corpus_dataset: Dataset identifier passed to ``load_dataset``.
        corpus_split: Split of ``corpus_dataset`` to load.
        chroma_db_dir: Path given to ``chromadb.PersistentClient``.

    Returns:
        The configured ``vf.ToolEnv``.
    """
    # Load corpus into memory: every row must provide "id", "title" and
    # "content"; the tools below look pages up by id in these two dicts.
    corpus = load_dataset(corpus_dataset, split=corpus_split)
    page_id_to_title: dict[str, str] = {}
    page_id_to_content: dict[str, str] = {}
    for row in corpus:
        # cast() only informs the type checker; it does nothing at runtime.
        row = cast(dict, row)
        pid = row["id"]
        page_id_to_title[pid] = row["title"]
        page_id_to_content[pid] = row["content"]

    # Lazy chroma initialization: the collection is built on first use and then
    # cached for the lifetime of this load_environment() call.
    _chroma_state: dict = {"collection": None}

    def _get_collection():
        """Return the "wiki_titles" Chroma collection, creating it on first use.

        The first call opens (or creates) the persistent collection under
        ``chroma_db_dir`` with an OpenAI embedding function and upserts any
        corpus titles it does not yet contain. Later calls return the cached
        collection.
        """
        cached = _chroma_state["collection"]
        if cached is not None:
            return cached
        openai_ef = embedding_functions.OpenAIEmbeddingFunction(
            model_name=embed_model,
            api_base=embed_base_url,
            api_key=os.getenv(embed_api_key_var, "EMPTY"),
        )
        client = chromadb.PersistentClient(path=chroma_db_dir)
        collection = client.get_or_create_collection(
            name="wiki_titles",
            embedding_function=cast(EmbeddingFunction[Embeddable], openai_ef),
        )
        # Populate before caching: if the upsert raises (e.g. the embedding
        # request fails), the next call retries instead of silently serving a
        # partly filled index for the rest of the run.
        _init_chroma(collection)
        _chroma_state["collection"] = collection
        return collection

    def _init_chroma(collection) -> None:
        """Upsert the title of every corpus page the collection does not hold yet."""
        page_ids = list(page_id_to_title)

        # Ask the collection which of our ids it already stores, 500 at a time.
        known: set[str] = set()
        for start in range(0, len(page_ids), 500):
            fetched = collection.get(ids=page_ids[start : start + 500])
            known.update(fetched.get("ids", []))

        todo = [pid for pid in page_ids if pid not in known]
        if not todo:
            return

        # Validate every title up front so nothing is written if one is blank.
        titles = []
        for pid in todo:
            cleaned = str(page_id_to_title[pid]).strip()
            if not cleaned:
                raise ValueError(f"Empty title for page_id {pid}")
            titles.append(cleaned)

        # The title is stored both as the embedded document and as metadata.
        chunk = 100
        for start in range(0, len(todo), chunk):
            batch_titles = titles[start : start + chunk]
            collection.upsert(
                ids=todo[start : start + chunk],
                documents=batch_titles,
                metadatas=[{"title": title} for title in batch_titles],
            )

    # Helper for normalizing section IDs
    def normalize_id(text: str) -> str:
        """Turn free text into an id.

        Surrounding whitespace is stripped, the text is lowercased, and each
        remaining space character becomes an underscore.
        """
        lowered = text.strip().lower()
        return "_".join(lowered.split(" "))

    # Tool 1: Search pages by title
    async def search_pages(query: str) -> list[dict]:
        """Search for top 10 relevant articles using title embedding similarity.

        args:
            query (str): The query to search for.

        returns:
            list[dict]: A list of dicts with page_id and title.

        example:
            "basketball" -> [{"page_id": "basketball", "title": "Basketball"}, ...]
        """
        collection = _get_collection()
        # The semaphore caps concurrent Chroma queries; the blocking query
        # call itself is pushed to a worker thread.
        async with _get_chroma_semaphore():
            hits = await asyncio.to_thread(
                collection.query, query_texts=[query], n_results=10
            )
        if not hits or not hits["metadatas"]:
            raise ValueError(f"No results found for query: {query}")
        # Only one query text was sent, so only the first result row is read.
        page_ids = hits["ids"][0]
        metadatas = hits["metadatas"][0]
        return [
            {"page_id": pid, "title": metadatas[pos]["title"]}
            for pos, pid in enumerate(page_ids)
        ]

    # Tool 2: View sections of a page
    async def view_sections(page_id: str) -> list[dict]:
        """View the sections of a page.

        args:
            page_id (str): The ID of the page to view.

        returns:
            list[dict]: A list of dicts with section_id and section_name.

        example:
            "basketball" -> [{"section_id": "basketball:history", "section_name": "History"}, ...]
        """
        content = page_id_to_content[page_id]
        # Every line that starts with "#" is treated as a section heading.
        headings = [
            line.lstrip("#").strip()
            for line in content.split("\n")
            if line.startswith("#")
        ]

        # If no sections, return whole page
        if not headings:
            return [{"section_id": f"{page_id}:full", "section_name": "Full Page"}]

        return [
            {"section_id": f"{page_id}:{normalize_id(name)}", "section_name": name}
            for name in headings
        ]

    # Tool 3: Read a section
    async def read_section(section_id: str) -> str:
        """Read a section of a page.

        args:
            section_id (str): The ID of the section to read.

        returns:
            str: The content of the section.

        example:
            "baseball:finnish_baseball" -> "Finnish baseball is a sport..."
        """
        # Split on the first ":" only, so the section part may itself contain ":".
        page_id, separator, wanted = section_id.partition(":")
        if not separator:
            raise ValueError(
                "Invalid section_id format. Expected: page_id:section_name"
            )

        content = page_id_to_content[page_id]

        # Special case for "full" section
        if wanted == "full":
            return content

        lines = content.split("\n")
        start = None
        for idx, line in enumerate(lines):
            if not line.startswith("#"):
                continue
            if start is not None:
                # The next heading of any level closes the section.
                return "\n".join(lines[start:idx])
            if normalize_id(line.lstrip("#").strip()) == wanted:
                start = idx

        if start is None:
            raise ValueError(f"Section not found: {section_id}")
        # No later heading: the section runs to the end of the page.
        return "\n".join(lines[start:])

    tools = [search_pages, view_sections, read_section]

    # Load trivia dataset
    parser = vf.Parser()
    dataset = load_dataset("willcb/wiki-trivia-questions-v4", split="train")

    # Setup judge rubric
    JUDGE_PROMPT = """Given a ground truth answer \
    and a response, determine if the response is both correct and coherent.

    Question:


Ground truth answer:


Response:

Respond either "yes" or "no" only.

If a response contains incoherent text, respond with "no" even if the correct answer is also present.
"""
judge_client = AsyncOpenAI(
    base_url=judge_base_url, api_key=os.environ[judge_api_key_var]
)
judge_rubric = JudgeRubric(
    judge_client=judge_client,
    judge_model=judge_model,
    parser=parser,
    judge_prompt=JUDGE_PROMPT,
)

async def judge_reward_func(judge, prompt, completion, answer, state) -> float:
    judge_response = await judge(prompt, completion, answer, state)
    if "yes" in judge_response.lower():
        return 1.0
    else:
        return 0.0

system_prompt = "Use the provided Wikipedia search tools to help answer questions."
judge_rubric.add_reward_func(judge_reward_func, weight=1.0)
vf_env = vf.ToolEnv(
    dataset=dataset,
    system_prompt=system_prompt,
    parser=parser,
    rubric=judge_rubric,
    tools=tools,
    max_turns=max_turns,
)
return vf_env

How It Works

1. Custom Tool Implementation

Three tools provide Wikipedia access:

search_pages(query) - Semantic search for relevant articles

Uses embedding similarity (text-embedding-3-small)
Returns top 10 matching pages with IDs and titles

view_sections(page_id) - List sections of an article

Parses markdown headers to extract sections
Returns section IDs and names

read_section(section_id) - Read section content

Extracts text between markdown headers
Returns full content of the section

2. Vector Search with ChromaDB

Embedding-based search enables semantic matching:

# Initialize ChromaDB with OpenAI embeddings
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small",
    api_base="https://api.openai.com/v1",
    api_key=os.getenv("OPENAI_API_KEY"),
)

client = chromadb.PersistentClient(path=".chroma_db")
collection = client.get_or_create_collection(
    name="wiki_titles",
    embedding_function=openai_ef,
)

Features:

Persistent storage (.chroma_db directory)
Lazy initialization (loads on first search)
Automatic upsert of missing pages
Concurrent access via semaphore (max 100 parallel)

3. LLM Judge Evaluation

JudgeRubric uses an LLM to evaluate answers:

judge_rubric = JudgeRubric(
    judge_client=judge_client,
    judge_model="gpt-4.1-mini",
    judge_prompt=JUDGE_PROMPT,
)

async def judge_reward_func(judge, prompt, completion, answer, state) -> float:
    """Return 1.0 when the judge's reply contains "yes" (any case), else 0.0."""
    verdict = (await judge(prompt, completion, answer, state)).lower()
    if "yes" in verdict:
        return 1.0
    return 0.0

Why judge evaluation?

Trivia answers have multiple valid phrasings
Example: “Barack Obama” = “Obama” = “Barack H. Obama”
Judge can handle paraphrasing and context

4. Tool Environment

ToolEnv provides tool calling capabilities:

vf_env = vf.ToolEnv(
    dataset=dataset,
    tools=[search_pages, view_sections, read_section],
    max_turns=10,
    rubric=judge_rubric,
)

The environment:

Extracts tool schemas from function signatures
Executes tool calls and returns results
Tracks tool usage metrics

Example Interaction

Full Trajectory
Dataset Sample
Tool Usage Pattern

Question: Who painted the Mona Lisa?

Assistant: I’ll search for information about the Mona Lisa painting.

Tool Call: search_pages("Mona Lisa painting")

Tool Result:

[
  {"page_id": "mona_lisa", "title": "Mona Lisa"},
  {"page_id": "leonardo_da_vinci", "title": "Leonardo da Vinci"},
  {"page_id": "louvre", "title": "Louvre Museum"},
  ...
]

Assistant: Let me view the sections of the Mona Lisa article.

Tool Call: view_sections("mona_lisa")

Tool Result:

[
  {"section_id": "mona_lisa:description", "section_name": "Description"},
  {"section_id": "mona_lisa:history", "section_name": "History"},
  {"section_id": "mona_lisa:artist", "section_name": "Artist"},
  ...
]

Assistant: I’ll read the artist section.

Tool Call: read_section("mona_lisa:artist")

Tool Result:

# Artist

The Mona Lisa was painted by Leonardo da Vinci in the early 16th century, 
between 1503 and 1519. It is considered one of his most famous works...

Assistant: Based on the Wikipedia article, Leonardo da Vinci painted the Mona Lisa.

Judge Evaluation: “yes”

Result: ✓ Correct (reward = 1.0)

{
  "question": "Who painted the Mona Lisa?",
  "answer": "Leonardo da Vinci"
}

{
  "question": "What is the capital of France?",
  "answer": "Paris"
}

{
  "question": "Who wrote '1984'?",
  "answer": "George Orwell"
}

Typical search flow:

Search for relevant pages
```
search_pages("topic keywords")
```
View sections to find relevant content
```
view_sections(page_id)
```
Read specific sections
```
read_section(section_id)
```
Answer based on retrieved information

Efficient agents minimize tool calls by:

Using precise search queries
Reading only relevant sections
Stopping once answer is found

Running the Environment

Installation

# Install with ChromaDB dependency
prime env install wiki-search

Quick Evaluation

# Evaluate on 20 questions
prime eval run wiki-search \
  -m openai/gpt-4.1-mini \
  -b https://api.openai.com/v1 \
  -k OPENAI_API_KEY \
  -n 20 \
  -r 5

Custom Configuration

# Use different embedding and judge models
prime eval run wiki-search \
  -m openai/gpt-4.1-mini \
  -a '{
    "max_turns": 15,
    "judge_model": "openai/gpt-4o-mini",
    "embed_model": "text-embedding-3-large",
    "corpus_dataset": "willcb/rare-wiki-pages"
  }' \
  -n 50 \
  -r 4

Configuration Options

Parameter	Default	Description
`max_turns`	`10`	Maximum tool interactions
`judge_model`	`"gpt-4.1-mini"`	Model for evaluating answers
`judge_base_url`	OpenAI API	Judge model endpoint
`judge_api_key_var`	`"OPENAI_API_KEY"`	Env var for judge API key
`embed_model`	`"text-embedding-3-small"`	Embedding model for search
`embed_base_url`	OpenAI API	Embedding endpoint
`embed_api_key_var`	`"OPENAI_API_KEY"`	Env var for embedding API key
`corpus_dataset`	`"willcb/rare-wiki-pages"`	Wikipedia corpus
`corpus_split`	`"train"`	Split of the corpus dataset to load
`chroma_db_dir`	`".chroma_db"`	ChromaDB storage path

Key Features

Tool Schema Generation

Verifiers automatically extracts tool schemas from Python functions:

async def search_pages(query: str) -> list[dict]:
    """Search for top 10 relevant articles using title embedding similarity.

    args:
        query (str): The query to search for.

    returns:
        list[dict]: A list of dicts with page_id and title.
    """

Generated schema:

{
  "name": "search_pages",
  "description": "Search for top 10 relevant articles using title embedding similarity.",
  "parameters": {
    "type": "object",
    "properties": {
      "query": {
        "type": "string",
        "description": "The query to search for."
      }
    },
    "required": ["query"]
  }
}

Concurrent Search with Semaphores

Manage concurrent ChromaDB access:

_chroma_semaphore = asyncio.Semaphore(100)  # Max 100 concurrent queries

async with _chroma_semaphore:
    results = await asyncio.to_thread(
        collection.query, query_texts=[query], n_results=10
    )

Prevents overwhelming the embedding API or ChromaDB.

Persistent Vector Storage

ChromaDB persists embeddings to disk:

First run: Embeds all Wikipedia titles (~slow)
Subsequent runs: Loads from .chroma_db/ (~fast)
Automatic incremental updates for new pages

Metrics Tracked

judge_reward: 1.0 if judge says “yes”, 0.0 otherwise
num_turns: Number of tool interactions
total_tool_calls: Total tools called
search_pages_calls: Number of search operations
view_sections_calls: Number of section list operations
read_section_calls: Number of section reads

Advanced Usage

Custom Wikipedia Corpus

Use your own Wikipedia subset:

env = load_environment(
    corpus_dataset="your-username/custom-wiki-corpus",
    corpus_split="train",
)

Required format:

{
    "id": "page_id",
    "title": "Page Title",
    "content": "# Section 1\n\nContent...\n\n# Section 2\n\n..."
}

Different Embedding Models

Use alternative embedding models:

# OpenAI alternatives
env = load_environment(
    embed_model="text-embedding-3-large",  # Higher quality
)

# Or use sentence-transformers (requires custom setup)
# See ChromaDB docs for details

Custom Judge Prompts

Modify evaluation criteria:

CUSTOM_JUDGE_PROMPT = """Does the response contain the correct answer?

Question: {question}
Expected: {answer}
Response: {response}

Answer 'yes' only if the response contains factually correct information 
matching the expected answer. Ignore formatting differences.
"""

judge_rubric = JudgeRubric(
    judge_model="gpt-4.1-mini",
    judge_prompt=CUSTOM_JUDGE_PROMPT,
)

Adding Tool Call Efficiency Rewards

Reward efficient tool usage:

async def efficiency_reward(state) -> float:
    """Reward finding the answer with few tool calls.

    Returns 0.5 when ``state["total_tool_calls"]`` is at most 3 (a missing key
    counts as 0 calls), otherwise 0.0.
    """
    calls_made = state.get("total_tool_calls", 0)
    # Bonus for quick answers
    return 0.5 if calls_made <= 3 else 0.0

judge_rubric.add_reward_func(efficiency_reward, weight=1.0)

Related Examples

GSM8K - Single-turn without tools
Math Python - Code execution tools
Wordle - Multi-turn game environment
Browser Examples - Complex stateful tools

Next Steps

Learn about ToolEnv architecture
See Custom Tools for implementing your own
Explore JudgeRubric for LLM evaluation
Check out ChromaDB for vector storage

Example Environments

Wikipedia Search Tool

Overview

Complete Implementation

How It Works

1. Custom Tool Implementation

2. Vector Search with ChromaDB

3. LLM Judge Evaluation

4. Tool Environment

Example Interaction

Running the Environment

Installation

Quick Evaluation

Custom Configuration

Configuration Options

Key Features

Tool Schema Generation

Concurrent Search with Semaphores

Persistent Vector Storage

Metrics Tracked

Advanced Usage

Custom Wikipedia Corpus

Different Embedding Models

Custom Judge Prompts

Adding Tool Call Efficiency Rewards

Next Steps

Build docs developers (and LLMs) love

Example Environments

Documentation Index

​Overview

​Complete Implementation

​How It Works

​1. Custom Tool Implementation

​2. Vector Search with ChromaDB

​3. LLM Judge Evaluation

​4. Tool Environment

​Example Interaction

​Running the Environment

​Installation

​Quick Evaluation

​Custom Configuration

​Configuration Options

​Key Features

​Tool Schema Generation

​Concurrent Search with Semaphores

​Persistent Vector Storage

​Metrics Tracked

​Advanced Usage

​Custom Wikipedia Corpus

​Different Embedding Models

​Custom Judge Prompts

​Adding Tool Call Efficiency Rewards

​Related Examples

​Next Steps

Build docs developers (and LLMs) love

Overview

Complete Implementation

How It Works

1. Custom Tool Implementation

2. Vector Search with ChromaDB

3. LLM Judge Evaluation

4. Tool Environment

Example Interaction

Running the Environment

Installation

Quick Evaluation

Custom Configuration

Configuration Options

Key Features

Tool Schema Generation

Concurrent Search with Semaphores

Persistent Vector Storage

Metrics Tracked

Advanced Usage

Custom Wikipedia Corpus

Different Embedding Models

Custom Judge Prompts

Adding Tool Call Efficiency Rewards

Related Examples

Next Steps