Annotator API Reference

Overview

The Annotator class provides functionality for annotating documents with structured extractions using language models. It handles text chunking, prompt generation, batched inference, and extraction alignment.

Class Definition

from langextract.annotation import Annotator

Constructor

Annotator(
    language_model: BaseLanguageModel,
    prompt_template: PromptTemplateStructured,
    format_type: FormatType = FormatType.YAML,
    attribute_suffix: str = "_attributes",
    fence_output: bool = False,
    format_handler: FormatHandler | None = None
)
language_model
BaseLanguageModel
required
Language model instance used to perform inference.
prompt_template
PromptTemplateStructured
required
Structured prompt template where the answer is expected to be formatted text (YAML or JSON).
format_type
FormatType
default:"FormatType.YAML"
The format type for the output (YAML or JSON).
attribute_suffix
str
default:"_attributes"
Suffix to append to attribute keys in the output.
fence_output
bool
default:"False"
Whether to expect/generate fenced output (json or yaml). When True, the model is prompted to generate fenced output and the resolver expects it.
format_handler
FormatHandler | None
default:"None"
Optional FormatHandler for managing format-specific logic. If provided, it takes precedence over other format parameters.

Methods

annotate_documents()

Annotates a sequence of documents with NLP extractions.
def annotate_documents(
    self,
    documents: Iterable[data.Document],
    resolver: AbstractResolver | None = None,
    max_char_buffer: int = 200,
    batch_length: int = 1,
    debug: bool = True,
    extraction_passes: int = 1,
    context_window_chars: int | None = None,
    show_progress: bool = True,
    tokenizer: Tokenizer | None = None,
    **kwargs
) -> Iterator[AnnotatedDocument]
documents
Iterable[data.Document]
required
Documents to annotate. Each document is expected to have a unique document_id.
resolver
AbstractResolver | None
default:"None"
Resolver to use for extracting information from text. Defaults to Resolver(format_type=FormatType.YAML).
max_char_buffer
int
default:"200"
Max number of characters that we can run inference on. The text will be broken into chunks up to this length.
batch_length
int
default:"1"
Number of chunks to process in a single batch.
debug
bool
default:"True"
Whether to populate debug fields and print extraction summary.
extraction_passes
int
default:"1"
Number of sequential extraction attempts to improve recall. Values > 1 reprocess tokens multiple times, potentially increasing costs but finding more entities. Non-overlapping extractions from later passes are merged with earlier results.
context_window_chars
int | None
default:"None"
Number of characters from the previous chunk to include as context for the current chunk. Helps with coreference resolution across chunk boundaries.
show_progress
bool
default:"True"
Whether to show progress bar during processing.
tokenizer
Tokenizer | None
default:"None"
Optional tokenizer to use. If None, uses default tokenizer.
**kwargs
Any
Additional arguments passed to LanguageModel.infer() and Resolver.
return
Iterator[AnnotatedDocument]
Iterator yielding annotated documents with extractions.
Raises: ValueError if there are no scored outputs during inference.

annotate_text()

Annotates a single text string with NLP extractions.
def annotate_text(
    self,
    text: str,
    resolver: AbstractResolver | None = None,
    max_char_buffer: int = 200,
    batch_length: int = 1,
    additional_context: str | None = None,
    debug: bool = True,
    extraction_passes: int = 1,
    context_window_chars: int | None = None,
    show_progress: bool = True,
    tokenizer: Tokenizer | None = None,
    **kwargs
) -> AnnotatedDocument
text
str
required
Source text to annotate.
resolver
AbstractResolver | None
default:"None"
Resolver to use for extracting information from text.
max_char_buffer
int
default:"200"
Max number of characters per chunk.
batch_length
int
default:"1"
Number of chunks to process in a single batch.
additional_context
str | None
default:"None"
Additional context to supplement prompt instructions.
debug
bool
default:"True"
Whether to populate debug fields and print summary.
extraction_passes
int
default:"1"
Number of sequential extraction passes for improved recall.
context_window_chars
int | None
default:"None"
Number of characters from previous chunk to include as context.
show_progress
bool
default:"True"
Whether to show progress bar.
tokenizer
Tokenizer | None
default:"None"
Optional tokenizer instance.
**kwargs
Any
Additional arguments for inference and resolver.
return
AnnotatedDocument
Annotated document with extractions.

Usage Examples

Basic Document Annotation

from langextract.annotation import Annotator
from langextract.prompting import PromptTemplateStructured
from langextract.core.data import Document, ExampleData
from langextract import factory

# Create a prompt template
prompt = PromptTemplateStructured(
    description="Extract person names and organizations from the text.",
    examples=[
        ExampleData(
            text="Dr. Sarah Johnson works at Google.",
            extractions=[
                {"person": "Dr. Sarah Johnson", "person_index": 1},
                {"organization": "Google", "organization_index": 2}
            ]
        )
    ]
)

# Create model and annotator
model = factory.create_model("gemini-1.5-flash", api_key="your-key")
annotator = Annotator(model, prompt)

# Prepare documents
documents = [
    Document(
        document_id="doc1",
        text="John Smith founded Acme Corp in 2020."
    ),
    Document(
        document_id="doc2",
        text="Mary Jones is the CEO of TechStart."
    )
]

# Annotate documents
for annotated_doc in annotator.annotate_documents(documents):
    print(f"Document: {annotated_doc.document_id}")
    for extraction in annotated_doc.extractions:
        print(f"  {extraction.extraction_class}: {extraction.extraction_text}")

Single Text Annotation

from langextract.annotation import Annotator
from langextract.prompting import PromptTemplateStructured
from langextract import factory

prompt = PromptTemplateStructured(
    description="Extract medical conditions and treatments."
)

model = factory.create_model("gemini-1.5-flash", api_key="your-key")
annotator = Annotator(model, prompt)

text = "The patient was diagnosed with diabetes and prescribed metformin."
result = annotator.annotate_text(text)

for extraction in result.extractions:
    print(f"{extraction.extraction_class}: {extraction.extraction_text}")
    if extraction.char_interval:
        print(f"  Position: {extraction.char_interval.start_pos}-{extraction.char_interval.end_pos}")

Multi-Pass Extraction

# Use multiple extraction passes to improve recall
for annotated_doc in annotator.annotate_documents(
    documents,
    extraction_passes=3,  # Run extraction 3 times
    max_char_buffer=500,
    show_progress=True
):
    print(f"Found {len(annotated_doc.extractions)} extractions")

Context Window for Coreference Resolution

# Include previous chunk context for better entity linking
for annotated_doc in annotator.annotate_documents(
    documents,
    max_char_buffer=200,
    context_window_chars=100,  # Include 100 chars from previous chunk
):
    print(annotated_doc.extractions)

Batch Processing

# Process multiple chunks in parallel
for annotated_doc in annotator.annotate_documents(
    documents,
    batch_length=8,  # Process 8 chunks at once
    max_char_buffer=300
):
    print(annotated_doc.extractions)

Notes

  • Documents are automatically chunked based on max_char_buffer
  • Extractions are aligned to original document positions using token and character intervals
  • Use extraction_passes > 1 to improve recall at the cost of increased API calls
  • Set context_window_chars to help resolve pronouns and references across chunk boundaries
  • Progress bar shows real-time extraction statistics when show_progress=True
  • Each document must have a unique document_id to prevent processing errors
  • Extractions include alignment_status indicating match quality (MATCH_EXACT, MATCH_LESSER, MATCH_FUZZY)
