Annotator API Reference

Overview

The Annotator class provides functionality for annotating documents with structured extractions using language models. It handles text chunking, prompt generation, batched inference, and extraction alignment.

Class Definition

from langextract.annotation import Annotator

Constructor

Annotator(
    language_model: BaseLanguageModel,
    prompt_template: PromptTemplateStructured,
    format_type: FormatType = FormatType.YAML,
    attribute_suffix: str = "_attributes",
    fence_output: bool = False,
    format_handler: FormatHandler | None = None
)
language_model
BaseLanguageModel
required
Language model instance used to perform inference.
prompt_template
PromptTemplateStructured
required
Structured prompt template where the answer is expected to be formatted text (YAML or JSON).
format_type
FormatType
default:"FormatType.YAML"
The format type for the output (YAML or JSON).
attribute_suffix
str
default:"_attributes"
Suffix to append to attribute keys in the output.
fence_output
bool
default:"False"
Whether to expect/generate fenced output (json or yaml). When True, the model is prompted to generate fenced output and the resolver expects it.
format_handler
FormatHandler | None
default:"None"
Optional FormatHandler for managing format-specific logic. If provided, it takes precedence over other format parameters.

Methods

annotate_documents()

Annotates a sequence of documents with NLP extractions.
def annotate_documents(
    self,
    documents: Iterable[data.Document],
    resolver: AbstractResolver | None = None,
    max_char_buffer: int = 200,
    batch_length: int = 1,
    debug: bool = True,
    extraction_passes: int = 1,
    context_window_chars: int | None = None,
    show_progress: bool = True,
    tokenizer: Tokenizer | None = None,
    **kwargs
) -> Iterator[AnnotatedDocument]
documents
Iterable[data.Document]
required
Documents to annotate. Each document is expected to have a unique document_id.
resolver
AbstractResolver | None
default:"None"
Resolver to use for extracting information from text. Defaults to Resolver(format_type=FormatType.YAML).
max_char_buffer
int
default:"200"
Max number of characters that we can run inference on. The text will be broken into chunks up to this length.
batch_length
int
default:"1"
Number of chunks to process in a single batch.
debug
bool
default:"True"
Whether to populate debug fields and print extraction summary.
extraction_passes
int
default:"1"
Number of sequential extraction attempts to improve recall. Values > 1 reprocess tokens multiple times, potentially increasing costs but finding more entities. Non-overlapping extractions from later passes are merged with earlier results.
context_window_chars
int | None
default:"None"
Number of characters from the previous chunk to include as context for the current chunk. Helps with coreference resolution across chunk boundaries.
show_progress
bool
default:"True"
Whether to show progress bar during processing.
tokenizer
Tokenizer | None
default:"None"
Optional tokenizer to use. If None, uses default tokenizer.
**kwargs
Any
Additional arguments passed to LanguageModel.infer() and Resolver.
return
Iterator[AnnotatedDocument]
Iterator yielding annotated documents with extractions.
Raises: ValueError if there are no scored outputs during inference.

annotate_text()

Annotates a single text string with NLP extractions.
def annotate_text(
    self,
    text: str,
    resolver: AbstractResolver | None = None,
    max_char_buffer: int = 200,
    batch_length: int = 1,
    additional_context: str | None = None,
    debug: bool = True,
    extraction_passes: int = 1,
    context_window_chars: int | None = None,
    show_progress: bool = True,
    tokenizer: Tokenizer | None = None,
    **kwargs
) -> AnnotatedDocument
text
str
required
Source text to annotate.
resolver
AbstractResolver | None
default:"None"
Resolver to use for extracting information from text.
max_char_buffer
int
default:"200"
Max number of characters per chunk.
batch_length
int
default:"1"
Number of chunks to process in a single batch.
additional_context
str | None
default:"None"
Additional context to supplement prompt instructions.
debug
bool
default:"True"
Whether to populate debug fields and print summary.
extraction_passes
int
default:"1"
Number of sequential extraction passes for improved recall.
context_window_chars
int | None
default:"None"
Number of characters from previous chunk to include as context.
show_progress
bool
default:"True"
Whether to show progress bar.
tokenizer
Tokenizer | None
default:"None"
Optional tokenizer instance.
**kwargs
Any
Additional arguments for inference and resolver.
return
AnnotatedDocument
Annotated document with extractions.

Usage Examples

Basic Document Annotation

from langextract.annotation import Annotator
from langextract.prompting import PromptTemplateStructured
from langextract.core.data import Document, ExampleData
from langextract import factory

# Create a prompt template
prompt = PromptTemplateStructured(
    description="Extract person names and organizations from the text.",
    examples=[
        ExampleData(
            text="Dr. Sarah Johnson works at Google.",
            extractions=[
                {"person": "Dr. Sarah Johnson", "person_index": 1},
                {"organization": "Google", "organization_index": 2}
            ]
        )
    ]
)

# Create model and annotator
model = factory.create_model("gemini-1.5-flash", api_key="your-key")
annotator = Annotator(model, prompt)

# Prepare documents
documents = [
    Document(
        document_id="doc1",
        text="John Smith founded Acme Corp in 2020."
    ),
    Document(
        document_id="doc2",
        text="Mary Jones is the CEO of TechStart."
    )
]

# Annotate documents
for annotated_doc in annotator.annotate_documents(documents):
    print(f"Document: {annotated_doc.document_id}")
    for extraction in annotated_doc.extractions:
        print(f"  {extraction.extraction_class}: {extraction.extraction_text}")

Single Text Annotation

from langextract.annotation import Annotator
from langextract.prompting import PromptTemplateStructured
from langextract import factory

prompt = PromptTemplateStructured(
    description="Extract medical conditions and treatments."
)

model = factory.create_model("gemini-1.5-flash", api_key="your-key")
annotator = Annotator(model, prompt)

text = "The patient was diagnosed with diabetes and prescribed metformin."
result = annotator.annotate_text(text)

for extraction in result.extractions:
    print(f"{extraction.extraction_class}: {extraction.extraction_text}")
    if extraction.char_interval:
        print(f"  Position: {extraction.char_interval.start_pos}-{extraction.char_interval.end_pos}")

Multi-Pass Extraction

# Use multiple extraction passes to improve recall
for annotated_doc in annotator.annotate_documents(
    documents,
    extraction_passes=3,  # Run extraction 3 times
    max_char_buffer=500,
    show_progress=True
):
    print(f"Found {len(annotated_doc.extractions)} extractions")

Context Window for Coreference Resolution

# Include previous chunk context for better entity linking
for annotated_doc in annotator.annotate_documents(
    documents,
    max_char_buffer=200,
    context_window_chars=100,  # Include 100 chars from previous chunk
):
    print(annotated_doc.extractions)

Batch Processing

# Process multiple chunks in parallel
for annotated_doc in annotator.annotate_documents(
    documents,
    batch_length=8,  # Process 8 chunks at once
    max_char_buffer=300
):
    print(annotated_doc.extractions)

Notes

  • Documents are automatically chunked based on max_char_buffer
  • Extractions are aligned to original document positions using token and character intervals
  • Use extraction_passes > 1 to improve recall at the cost of increased API calls
  • Set context_window_chars to help resolve pronouns and references across chunk boundaries
  • Progress bar shows real-time extraction statistics when show_progress=True
  • Each document must have a unique document_id to prevent processing errors
  • Extractions include alignment_status indicating match quality (MATCH_EXACT, MATCH_LESSER, MATCH_FUZZY)
