AlignedSentence is a dataclass that represents a single sentence in the transcription, with timing information and constituent tokens.
Class Definition
from parakeet_mlx import from_pretrained

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
result = model.transcribe("audio.wav")

# Access sentences from the result: each has .text plus timing fields.
for sentence in result.sentences:
    print(sentence.text)
    print(f"Time: {sentence.start:.2f}s - {sentence.end:.2f}s")
Fields
The text content of the sentence. This is constructed by concatenating all token texts. Example: "Hello, how are you?"
List of AlignedToken objects that make up this sentence. Tokens are automatically sorted by start time. Each token includes:
id: Token ID in the vocabulary
text: Token text
start: Start time in seconds
end: End time in seconds
duration: Duration in seconds
confidence: Confidence score (0.0 to 1.0)
Start time of the sentence in seconds. Automatically computed as the start time of the first token. Example: 1.23
End time of the sentence in seconds. Automatically computed as the end time of the last token. Example: 4.56
Duration of the sentence in seconds. Automatically computed as end - start. Example: 3.33
Aggregate confidence score for the sentence (0.0 to 1.0). Computed as the geometric mean of all token confidence scores. Formula: exp(mean(log(token_confidences))). Example: 0.87
Automatic Computation
When an AlignedSentence is created, the following fields are automatically computed in __post_init__:
- tokens: Sorted by start time
- start: Set to first token’s start time
- end: Set to last token’s end time
- duration: Computed as end - start
- confidence: Computed as geometric mean of token confidences
You should not manually set these values; they are derived from the tokens.
Examples
Basic Usage
from parakeet_mlx import from_pretrained

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
result = model.transcribe("audio.wav")

# Print a summary line for every detected sentence.
for sentence in result.sentences:
    print(f"Text: {sentence.text}")
    print(f"Time: {sentence.start:.2f}s - {sentence.end:.2f}s")
    print(f"Duration: {sentence.duration:.2f}s")
    print(f"Confidence: {sentence.confidence:.2%}")
    print(f"Tokens: {len(sentence.tokens)}")
    print()
Analyzing Token-Level Details
from parakeet_mlx import from_pretrained

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
result = model.transcribe("audio.wav")

# Show per-token timing and confidence underneath each sentence.
for i, sentence in enumerate(result.sentences, 1):
    print(f"\nSentence {i}: {sentence.text}")
    print(f"  Overall confidence: {sentence.confidence:.2%}")
    # Plain string: the original used an f-string with no placeholders.
    print("  Token breakdown:")
    for token in sentence.tokens:
        print(f"    {token.start:.2f}s: '{token.text}' (conf: {token.confidence:.2%})")
Filter by Confidence
from parakeet_mlx import from_pretrained

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
result = model.transcribe("audio.wav")

# Find sentences whose aggregate confidence falls below a threshold;
# these are the spans most worth manual review.
low_confidence_threshold = 0.7

print("Low confidence sentences:")
for sentence in result.sentences:
    if sentence.confidence < low_confidence_threshold:
        print(f"  [{sentence.start:.2f}s] Confidence: {sentence.confidence:.2%}")
        print(f"  Text: {sentence.text}")
        print()
Generate Time-Stamped Transcript
from parakeet_mlx import from_pretrained, DecodingConfig, SentenceConfig

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")

# NOTE: silence_gap presumably controls how much silence splits
# sentences — confirm against the SentenceConfig documentation.
config = DecodingConfig(
    sentence=SentenceConfig(silence_gap=3.0)
)
result = model.transcribe("audio.wav", decoding_config=config)

print("=== Time-Stamped Transcript ===")
for sentence in result.sentences:
    timestamp = f"[{sentence.start:.1f}s - {sentence.end:.1f}s]"
    print(f"{timestamp:20} {sentence.text}")
Word Highlighting for Subtitles
from parakeet_mlx import from_pretrained, DecodingConfig, SentenceConfig


def format_vtt_time(seconds):
    """Format a time in seconds as a WebVTT timestamp (HH:MM:SS.mmm)."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = seconds % 60
    # :06.3f pads to "SS.mmm" (e.g. 1.5 -> "01.500").
    return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"


model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
config = DecodingConfig(
    sentence=SentenceConfig(max_words=15, max_duration=5.0)
)
result = model.transcribe("audio.wav", decoding_config=config)

print("=== VTT with Word Highlighting ===")
for i, sentence in enumerate(result.sentences, 1):
    print(f"{i}")
    print(f"{format_vtt_time(sentence.start)} --> {format_vtt_time(sentence.end)}")
    # Word-level timestamps: each token becomes a <timestamp><c>...</c> span.
    for token in sentence.tokens:
        print(f"<{format_vtt_time(token.start)}><c>{token.text}</c>", end="")
    print("\n")
Search Within Time Range
from parakeet_mlx import from_pretrained

# Load the model and transcribe the audio once up front;
# the search below runs on the resulting sentence list.
model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
result = model.transcribe("audio.wav")
def search_in_timerange(sentences, start_time, end_time, query):
    """Search for text within a specific time range.

    Args:
        sentences: Iterable of objects exposing .start, .end (seconds)
            and .text (the sentence string).
        start_time: Window start in seconds (inclusive).
        end_time: Window end in seconds (inclusive).
        query: Substring to look for, matched case-insensitively.

    Returns:
        List of sentences that overlap the window and contain query,
        in input order.
    """
    # Hoist the loop-invariant lowercasing out of the loop.
    needle = query.lower()
    results = []
    for sentence in sentences:
        # Interval-overlap test: sentence intersects [start_time, end_time].
        if sentence.start <= end_time and sentence.end >= start_time:
            if needle in sentence.text.lower():
                results.append(sentence)
    return results
# Search for "machine learning" between 1:00 and 2:00.
matches = search_in_timerange(result.sentences, 60.0, 120.0, "machine learning")
for sentence in matches:
    print(f"Found at {sentence.start:.1f}s: {sentence.text}")
Calculate Speaking Statistics
from parakeet_mlx import from_pretrained

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
result = model.transcribe("audio.wav")

sentences = result.sentences
if not sentences:
    # Guard: the averages below would divide by zero on an empty result.
    print("No speech detected.")
else:
    # Aggregate statistics over the whole transcription.
    total_duration = sum(s.duration for s in sentences)
    avg_duration = total_duration / len(sentences)
    avg_confidence = sum(s.confidence for s in sentences) / len(sentences)

    # Count words from the sentence text itself. Counting tokens that
    # contain a space is tokenizer-dependent and undercounts: the first
    # word of a sentence typically has no leading space.
    total_words = sum(len(s.text.split()) for s in sentences)

    print(f"Total sentences: {len(sentences)}")
    print(f"Total speaking time: {total_duration:.2f}s")
    print(f"Average sentence duration: {avg_duration:.2f}s")
    print(f"Average confidence: {avg_confidence:.2%}")
    print(f"Total words: {total_words}")
    if total_duration > 0:
        print(f"Words per minute: {(total_words / total_duration * 60):.1f}")