Skip to main content

Overview

The chunking module provides functionality for breaking large documents into smaller chunks that fit within language model context windows. It handles sentence boundaries, newlines, and token limits intelligently.

Module

from langextract import chunking

Classes

TextChunk

Stores a text chunk with attributes linking it to the source document.
@dataclasses.dataclass
class TextChunk:
    token_interval: tokenizer_lib.TokenInterval
    document: data.Document | None = None
Attributes:
token_interval
TokenInterval
required
The token interval of the chunk in the source document.
document
Document | None
default:"None"
The source document.
Properties:
  • document_id: Gets the document ID from the source document
  • document_text: Gets the tokenized text from the source document
  • chunk_text: Gets the chunk text (raises ValueError if document_text not set)
  • sanitized_chunk_text: Gets sanitized chunk text with normalized whitespace
  • additional_context: Gets additional context for prompting from the source document
  • char_interval: Gets the character interval corresponding to the token interval

ChunkIterator

Iterates through chunks of a tokenized text.
class ChunkIterator:
    def __init__(
        self,
        text: str | tokenizer_lib.TokenizedText | None,
        max_char_buffer: int,
        tokenizer_impl: tokenizer_lib.Tokenizer,
        document: data.Document | None = None
    )
text
str | TokenizedText | None
required
Document to chunk. Can be a string or tokenized text.
max_char_buffer
int
required
Maximum size, in characters, of the text buffer that inference can be run on.
tokenizer_impl
Tokenizer
required
Tokenizer instance to use for tokenization.
document
Document | None
default:"None"
Optional source document.
Behavior: Chunks are created intelligently:
  • Case A: If a sentence fits within max_char_buffer, multiple sentences are combined
  • Case B: If a sentence exceeds max_char_buffer, it’s split at newlines or token boundaries
  • Case C: If a single token exceeds max_char_buffer, it comprises the whole chunk

SentenceIterator

Iterates through sentences of a tokenized text.
class SentenceIterator:
    def __init__(
        self,
        tokenized_text: tokenizer_lib.TokenizedText,
        curr_token_pos: int = 0
    )
tokenized_text
TokenizedText
required
Tokenized document whose sentences are iterated.
curr_token_pos
int
default:"0"
Iterate through sentences from this token position.

Functions

make_batches_of_textchunk()

Processes chunks into batches for inference.
def make_batches_of_textchunk(
    chunk_iter: Iterator[TextChunk],
    batch_length: int
) -> Iterable[Sequence[TextChunk]]
chunk_iter
Iterator[TextChunk]
required
Iterator of TextChunks.
batch_length
int
required
Number of chunks to include in each batch.
return
Iterable[Sequence[TextChunk]]
Batches of TextChunks ready for batched inference.

get_token_interval_text()

Gets the text within an interval of tokens.
def get_token_interval_text(
    tokenized_text: tokenizer_lib.TokenizedText,
    token_interval: tokenizer_lib.TokenInterval
) -> str
tokenized_text
TokenizedText
required
Tokenized document.
token_interval
TokenInterval
required
An interval specifying the start (inclusive) and end (exclusive) indices of tokens to extract.
return
str
Text within the token interval.
Raises: ValueError if token indices are invalid; TokenUtilError if tokenizer returns empty string.

get_char_interval()

Returns the character interval corresponding to a token interval.
def get_char_interval(
    tokenized_text: tokenizer_lib.TokenizedText,
    token_interval: tokenizer_lib.TokenInterval
) -> data.CharInterval
tokenized_text
TokenizedText
required
Tokenized source document containing the tokens referenced by the interval.
token_interval
TokenInterval
required
Token interval.
return
CharInterval
Character interval of the token interval of interest.
Raises: ValueError if token_interval is invalid.

create_token_interval()

Creates a token interval.
def create_token_interval(
    start_index: int,
    end_index: int
) -> tokenizer_lib.TokenInterval
start_index
int
required
First token’s index (inclusive).
end_index
int
required
Last token’s index + 1 (exclusive).
return
TokenInterval
Token interval object.
Raises: ValueError if token indices are invalid.

Usage Examples

Basic Text Chunking

from langextract.chunking import ChunkIterator
from langextract.core.tokenizer import RegexTokenizer
from langextract.core.data import Document

text = """This is a long document. It has multiple sentences. 
We need to break it into chunks that fit within our context window.
Each chunk should respect sentence boundaries when possible."""

tokenizer = RegexTokenizer()
document = Document(document_id="doc1", text=text)

chunk_iter = ChunkIterator(
    text=text,
    max_char_buffer=100,
    tokenizer_impl=tokenizer,
    document=document
)

for i, chunk in enumerate(chunk_iter):
    print(f"Chunk {i + 1}:")
    print(f"  Text: {chunk.chunk_text}")
    print(f"  Token interval: {chunk.token_interval.start_index}-{chunk.token_interval.end_index}")
    print(f"  Char interval: {chunk.char_interval.start_pos}-{chunk.char_interval.end_pos}")

Batching Chunks

from langextract.chunking import ChunkIterator, make_batches_of_textchunk
from langextract.core.tokenizer import RegexTokenizer

text = "This is sentence one. This is sentence two. This is sentence three."
tokenizer = RegexTokenizer()

chunk_iter = ChunkIterator(
    text=text,
    max_char_buffer=50,
    tokenizer_impl=tokenizer
)

batches = make_batches_of_textchunk(chunk_iter, batch_length=2)

for batch_num, batch in enumerate(batches):
    print(f"Batch {batch_num + 1}: {len(batch)} chunks")
    for chunk in batch:
        print(f"  - {chunk.chunk_text}")

Working with Token Intervals

from langextract.chunking import create_token_interval, get_token_interval_text
from langextract.core.tokenizer import RegexTokenizer

tokenizer = RegexTokenizer()
tokenized_text = tokenizer.tokenize("The quick brown fox jumps.")

# Create an interval for tokens 1-3
token_interval = create_token_interval(start_index=1, end_index=4)

# Extract text for this interval
text = get_token_interval_text(tokenized_text, token_interval)
print(f"Extracted text: {text}")

Sentence Iteration

from langextract.chunking import SentenceIterator
from langextract.core.tokenizer import RegexTokenizer

text = "First sentence. Second sentence. Third sentence."
tokenizer = RegexTokenizer()
tokenized_text = tokenizer.tokenize(text)

sentence_iter = SentenceIterator(tokenized_text)

for sentence_interval in sentence_iter:
    sentence_text = get_token_interval_text(tokenized_text, sentence_interval)
    print(f"Sentence: {sentence_text}")

Handling Long Tokens

from langextract.chunking import ChunkIterator
from langextract.core.tokenizer import RegexTokenizer

# Text with a very long word
text = "This is antidisestablishmentarianism."

tokenizer = RegexTokenizer()
chunk_iter = ChunkIterator(
    text=text,
    max_char_buffer=20,  # Small buffer
    tokenizer_impl=tokenizer
)

for chunk in chunk_iter:
    print(f"Chunk: '{chunk.chunk_text}' (length: {len(chunk.chunk_text)})")
# Output:
# Chunk: 'This is' (length: 7)
# Chunk: 'antidisestablishmentarianism' (length: 28)  # Exceeds buffer
# Chunk: '.' (length: 1)

Respecting Newlines

from langextract.chunking import ChunkIterator
from langextract.core.tokenizer import RegexTokenizer

text = """No man is an island,
Entire of itself,
Every man is a piece of the continent,
A part of the main."""

tokenizer = RegexTokenizer()
chunk_iter = ChunkIterator(
    text=text,
    max_char_buffer=40,
    tokenizer_impl=tokenizer
)

for i, chunk in enumerate(chunk_iter):
    print(f"Chunk {i + 1}: {chunk.chunk_text!r}")
# Chunks will break at newlines when possible

Notes

  • ChunkIterator intelligently handles sentence boundaries and newlines
  • Chunks respect max_char_buffer while maximizing chunk size
  • Single tokens that exceed the buffer are included as standalone chunks
  • Token intervals use 0-based indexing with exclusive end indices
  • Character intervals map chunks back to original document positions
  • Use sanitized_chunk_text to get whitespace-normalized text
  • The chunking algorithm prioritizes keeping sentences intact when possible

Build docs developers (and LLMs) love