Documentation Index
Fetch the complete documentation index at: https://mintlify.com/primeintellect-ai/verifiers/llms.txt
Use this file to discover all available pages before exploring further.
Reward Functions
Reward functions score model outputs and are the core evaluation mechanism in Verifiers.
Overview
Reward functions come in two flavors:
- Individual: Score single rollouts (most common)
- Group: Score multiple rollouts together (for comparative evaluation)
Both types use flexible signatures that automatically receive relevant data from the state.
Type Definitions
IndividualRewardFunc = Callable[..., float | Awaitable[float]]
GroupRewardFunc = Callable[..., list[float] | Awaitable[list[float]]]
RewardFunc = IndividualRewardFunc | GroupRewardFunc
Individual Reward Functions
Functions that score a single rollout at a time.
Signature
def reward_fn(
    # Standard fields - request any subset of them by name:
    prompt: Messages | str = ...,
    completion: Messages | str = ...,
    answer: Any = ...,
    task: str = ...,
    state: State = ...,
    info: dict = ...,
    # Objects registered via rubric.add_class_object() can be requested too:
    parser: vf.Parser = ...,
    custom_obj: Any = ...,
    # Collects class objects and extra fields not named above
    **kwargs
) -> float:
    """Score a single rollout; usually within 0.0-1.0, but any float is allowed."""
Available Parameters
- prompt: The input prompt (from state["prompt"]).
- completion: The model's final completion (from state["completion"]).
- answer: Ground truth answer from dataset (from state["answer"]).
- task: Task identifier (from state["task"]).
- state: Full state dictionary with trajectory, timing, etc.
- info: Additional metadata from dataset (from state["info"]).
- **kwargs: Catches class objects and extra fields. Always include for forward compatibility.
Examples
Simple Exact Match
def exact_match(answer: str, completion: str, **kwargs) -> float:
    """Return 1.0 when the answer occurs in the completion (case-insensitive), else 0.0."""
    needle = answer.lower()
    haystack = completion.lower()
    if needle in haystack:
        return 1.0
    return 0.0
Using Parser
def parsed_match(answer: str, completion: str, parser: vf.Parser, **kwargs) -> float:
    """Return 1.0 when the parser's extracted answer equals the ground truth, else 0.0."""
    if parser.parse_answer(completion) == answer:
        return 1.0
    return 0.0
State-based
def efficiency_reward(state: vf.State, **kwargs) -> float:
    """Score shorter trajectories higher: 1.0 minus a tenth per turn, floored at 0.0."""
    remaining = 1.0 - len(state["trajectory"]) / 10
    return remaining if remaining > 0.0 else 0.0
Async Reward
async def llm_judge(completion: str, answer: str, **kwargs) -> float:
    """Use an LLM to judge quality.

    Returns the judge's rating clamped to [0.0, 1.0], or 0.0 when the
    judge's reply cannot be parsed as a number.
    """
    client = vf.OpenAIClient()
    prompt = f"Rate this answer (0-1): {completion}\nGround truth: {answer}"
    response = await client.generate(
        model="gpt-4",
        prompt=prompt,
        sampling_args={"temperature": 0}
    )
    score_text = response.message.content
    try:
        score = float(score_text.strip())
    except (AttributeError, ValueError):
        # The judge may answer with prose or with no content at all; treat an
        # unparseable reply as a zero score instead of failing the rollout.
        return 0.0
    # The prompt asks for a 0-1 rating; clamp replies that ignore the range.
    return min(1.0, max(0.0, score))
Group Reward Functions
Functions that score multiple rollouts together, enabling comparative evaluation.
Signature
def group_reward_fn(
    # List-valued counterparts of the individual parameters:
    prompts: list[Messages | str] = ...,
    completions: list[Messages | str] = ...,
    answers: list[Any] = ...,
    tasks: list[str] = ...,
    states: list[State] = ...,
    infos: list[dict] = ...,
    # Class objects keep their singular names
    parser: vf.Parser = ...,
    **kwargs
) -> list[float]:
    """Score a group of rollouts together, returning one score per rollout."""
Examples
Relative Ranking
def rank_reward(completions: list[str], **kwargs) -> list[float]:
    """Reward the longer half of the responses.

    Each completion scores 1.0 when its length is at least the group's
    (upper) median length and 0.0 otherwise. Ties at the median are all
    rewarded, so more than half of the group can score 1.0. An empty group
    yields an empty list.
    """
    if not completions:
        # Nothing to rank; indexing the sorted lengths would raise IndexError.
        return []
    lengths = [len(c) for c in completions]
    median = sorted(lengths)[len(lengths) // 2]
    return [1.0 if length >= median else 0.0 for length in lengths]
Best-of-N
def best_of_n(
    completions: list[str],
    answers: list[str],
    **kwargs
) -> list[float]:
    """Give reward only to the best answer(s).

    A completion scores 1.0 when it ties for the highest correctness score
    in the group, provided that score is positive. When no completion
    contains its answer, or the group is empty, nobody is rewarded.
    """
    scores = [
        1.0 if ans in comp else 0.0
        for ans, comp in zip(answers, completions)
    ]
    # default=0.0 keeps an empty group from raising ValueError.
    max_score = max(scores, default=0.0)
    if max_score <= 0.0:
        # Every completion is wrong: "best of a bad bunch" must not be
        # rewarded as if it were correct.
        return [0.0] * len(scores)
    return [1.0 if s == max_score else 0.0 for s in scores]
Majority Voting
def majority_vote(
    completions: list[str],
    answer: str,  # Singular - same for all
    parser: vf.Parser,
    **kwargs
) -> list[float]:
    """Reward answers that match the majority or the ground truth.

    Each completion is parsed; a rollout scores 1.0 when its parsed answer
    equals the most common parsed answer in the group or equals ``answer``.
    Completions whose parse result is None never vote and never score.
    An empty group yields an empty list.

    NOTE(review): the documented group signature lists plural ``answers``;
    confirm the framework also injects a singular ``answer`` here.
    """
    from collections import Counter

    # Parse all answers
    parsed = [parser.parse_answer(c) for c in completions]
    # Find majority among successful parses only, so that "failed to parse"
    # cannot win the vote.
    counts = Counter(p for p in parsed if p is not None)
    if not counts:
        # Empty group or nothing parsed: most_common(1)[0] would raise.
        return [0.0] * len(parsed)
    majority_answer = counts.most_common(1)[0][0]
    # Reward majority + correct answers
    return [
        1.0 if p is not None and (p == majority_answer or p == answer) else 0.0
        for p in parsed
    ]
Metrics vs Rewards
Reward functions can be used as metrics (tracked but not contributing to reward) by setting weight=0:
def response_length(completion: str, **kwargs) -> float:
    """Report ``len(completion)`` as a float, for use as a tracked metric."""
    size = len(completion)
    return float(size)
# Option 1: pass the function with weight 0 so it is tracked but adds
# nothing to the reward.
rubric = vf.Rubric(
    funcs=[exact_match, response_length],
    weights=[1.0, 0.0],  # length is a metric only
)

# Option 2 (instead of option 1, not in addition to it): build the rubric
# from the reward functions only, then attach the metric with add_metric.
# Doing both would register response_length twice.
rubric = vf.Rubric(funcs=[exact_match], weights=[1.0])
rubric.add_metric(response_length)
Async Support
Both individual and group functions can be async:
async def async_individual(completion: str, **kwargs) -> float:
    """Await an async operation on the completion and return its result as a float."""
    return float(await some_async_operation(completion))
async def async_group(completions: list[str], **kwargs) -> list[float]:
    """Run the async operation on every completion concurrently; return float results in order."""
    pending = [some_async_operation(c) for c in completions]
    results = await asyncio.gather(*pending)
    return list(map(float, results))
Class Objects
Register objects that reward functions can access:
class CustomScorer:
    """Toy scorer whose score grows linearly with text length."""

    def score(self, text: str) -> float:
        """Return the character count of ``text`` divided by 100."""
        char_count = len(text)
        return char_count / 100
# Single instance, registered on the rubric below under the name "scorer".
scorer = CustomScorer()
def use_scorer(completion: str, scorer: CustomScorer, **kwargs) -> float:
    """Delegate scoring of the completion to the supplied ``scorer`` object."""
    result = scorer.score(completion)
    return result
rubric = vf.Rubric(funcs=[use_scorer])
# Register the instance under the name "scorer"; presumably this is matched
# to use_scorer's `scorer` parameter by name - confirm against the Rubric docs.
rubric.add_class_object("scorer", scorer)
Debugging Rewards
Print intermediate values:
def debug_reward(answer: str, completion: str, state: vf.State, **kwargs) -> float:
    """Substring-match reward that prints its inputs and result while scoring."""
    print(f"Answer: {answer}")
    print(f"Completion: {completion}")
    print(f"Trajectory length: {len(state['trajectory'])}")
    if answer in completion:
        score = 1.0
    else:
        score = 0.0
    print(f"Score: {score}")
    return score
Common Patterns
Multi-criteria Scoring
def multi_criteria(
    answer: str,
    completion: str,
    state: vf.State,
    **kwargs
) -> float:
    """Blend correctness (70%), turn efficiency (20%) and length (10%) into one score."""
    # Correctness: does the answer occur in the completion?
    if answer in completion:
        correct = 1.0
    else:
        correct = 0.0

    # Efficiency: lose a tenth per trajectory turn, never below zero
    turn_count = len(state["trajectory"])
    efficiency = max(0.0, 1.0 - turn_count / 10)

    # Length: grows with completion size, saturating at 1000 characters
    length_score = min(len(completion) / 1000, 1.0)

    # Weighted combination
    return 0.7 * correct + 0.2 * efficiency + 0.1 * length_score
Partial Credit
def partial_credit(answer: str, completion: str, **kwargs) -> float:
    """Return the fraction of the answer's distinct lowercase words found in the completion.

    An answer with no words scores 0.0.
    """
    expected = set(answer.lower().split())
    produced = set(completion.lower().split())
    if not expected:
        return 0.0
    return len(expected.intersection(produced)) / len(expected)
Error Handling
def safe_reward(answer: str, completion: str, **kwargs) -> float:
    """Compute the reward, falling back to 0.0 (with a printed message) on any error."""
    try:
        # Scoring and float conversion both stay inside the try so that a
        # failure in either one is reported and mapped to 0.0.
        score = float(complex_calculation(answer, completion))
    except Exception as e:
        print(f"Reward computation failed: {e}")
        return 0.0
    else:
        return score
Best Practices
- Always include **kwargs for forward compatibility
- Return float (not int, bool, etc.) for rewards
- Handle None values gracefully
- Keep deterministic when possible (for reproducibility)
- Document score range in docstring
- Use async only when necessary (adds overhead)
- Validate inputs at function start
Type Checking
from verifiers.types import RewardFunc, State, Messages
def my_reward(answer: str, completion: Messages, state: State, **kwargs) -> float:
    """Fully annotated reward function used for type checking; always returns 1.0."""
    return 1.0
# Verify it's a valid RewardFunc: a static type checker flags this assignment
# if my_reward is incompatible with the alias. The annotation is not enforced
# at runtime.
func: RewardFunc = my_reward
See Also
- Rubric - Combining multiple reward functions
- State - Full state dictionary
- Parser - Extracting answers from completions