AlignedResult is a dataclass that represents the complete transcription result, containing the full text and a list of aligned sentences with timestamps.
Class Definition
# Load a pretrained Parakeet model and transcribe an audio file.
from parakeet_mlx import from_pretrained

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
result = model.transcribe("audio.wav")

# result is an AlignedResult object
print(type(result))  # <class 'parakeet_mlx.alignment.AlignedResult'>
Fields
The complete transcribed text. This is the concatenation of all sentence texts, automatically trimmed of leading/trailing whitespace. Example: "Hello, how are you? I'm doing great."
List of AlignedSentence objects, each containing text, timing information, and constituent tokens. Each sentence includes:
text: Sentence text
start: Start time in seconds
end: End time in seconds
duration: Duration in seconds
tokens: List of AlignedToken objects
confidence: Aggregate confidence score (0.0 to 1.0)
Properties
Read-only property that returns all tokens from all sentences in a flat list. This is a convenience that flattens `[token for sentence in sentences for token in sentence.tokens]`. Example:

result = model.transcribe("audio.wav")
# Access all tokens at once via the flat `tokens` property.
all_tokens = result.tokens
print(f"Total tokens: {len(all_tokens)}")

# Each AlignedToken carries its own start time (seconds) and text.
for token in all_tokens:
    print(f"{token.start:.2f}s: {token.text}")
Examples
Basic Transcription
# Basic transcription: load the model once, then transcribe a file.
from parakeet_mlx import from_pretrained

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
result = model.transcribe("audio.wav")

# Access the full text
print(result.text)
# Output: "Hello, how are you? I'm doing great."

# Access sentences (each carries start/end timestamps in seconds)
print(f"Number of sentences: {len(result.sentences)}")
for sentence in result.sentences:
    print(f"[{sentence.start:.2f}s - {sentence.end:.2f}s] {sentence.text}")
Accessing Tokens
# Token-level access: per-token timestamps and confidence scores.
from parakeet_mlx import from_pretrained

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
result = model.transcribe("audio.wav")

# Get all tokens from the property
all_tokens = result.tokens
print(f"Total tokens: {len(all_tokens)}")

# NOTE(review): assumes AlignedToken exposes a per-token `confidence`
# attribute like AlignedSentence does — confirm against the token docs.
for token in all_tokens:
    print(f"{token.start:.2f}s - {token.end:.2f}s: '{token.text}' (confidence: {token.confidence:.2f})")
Working with Sentences
# Control sentence splitting by passing a DecodingConfig to transcribe().
from parakeet_mlx import from_pretrained, DecodingConfig, SentenceConfig

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")

# Configure sentence splitting
# (presumably max_words caps words per sentence and silence_gap is the
# pause length, in seconds, that forces a break — confirm in SentenceConfig docs)
config = DecodingConfig(
    sentence=SentenceConfig(
        max_words=30,
        silence_gap=5.0
    )
)
result = model.transcribe("audio.wav", decoding_config=config)

# Iterate through sentences
for i, sentence in enumerate(result.sentences, 1):
    print(f"\nSentence {i}:")
    print(f" Time: {sentence.start:.2f}s - {sentence.end:.2f}s ({sentence.duration:.2f}s)")
    print(f" Confidence: {sentence.confidence:.2%}")
    print(f" Text: {sentence.text}")
    print(f" Tokens: {len(sentence.tokens)}")
Generating Subtitles
# Generate SRT-style subtitles from the aligned sentences.
from parakeet_mlx import from_pretrained, DecodingConfig, SentenceConfig


def format_timestamp(seconds):
    """Format a time in seconds as an SRT timestamp: HH:MM:SS,mmm."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")

# Keep each cue short enough to read on screen.
config = DecodingConfig(
    sentence=SentenceConfig(max_words=15, max_duration=5.0)
)
result = model.transcribe("audio.wav", decoding_config=config)

# Generate SRT-style output: cue number, time range, text, blank separator.
# NOTE: format_timestamp must be defined before this loop runs — in the
# original snippet it was defined after its first call, which raises
# NameError when the script executes top to bottom.
for i, sentence in enumerate(result.sentences, 1):
    start_time = format_timestamp(sentence.start)
    end_time = format_timestamp(sentence.end)
    print(f"{i}")
    print(f"{start_time} --> {end_time}")
    print(f"{sentence.text}")
    print()
Batch Processing
# Batch-process several audio files with a single model.generate() call.
from parakeet_mlx import from_pretrained
from parakeet_mlx.audio import load_audio, get_logmel
import mlx.core as mx

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")

# Process multiple audio files
audio_files = ["audio1.wav", "audio2.wav", "audio3.wav"]

# Load and preprocess each file at the model's expected sample rate.
audio_data = [
    load_audio(f, model.preprocessor_config.sample_rate)
    for f in audio_files
]
mels = [get_logmel(audio, model.preprocessor_config) for audio in audio_data]

# Batch process.
# NOTE(review): concatenating along axis 0 assumes every mel has the same
# time dimension; files of differing lengths would need padding to the
# longest mel first — confirm against the parakeet_mlx batching docs.
mel_batch = mx.concatenate(mels, axis=0)
results = model.generate(mel_batch)  # Returns list[AlignedResult]

# Report each file's transcription alongside its filename.
# (The original printed a literal "(unknown)" placeholder and never used
# the zipped filename — fixed to interpolate it.)
for filename, result in zip(audio_files, results):
    print(f"\n{filename}:")
    print(f" {result.text}")
    print(f" Sentences: {len(result.sentences)}")
    print(f" Tokens: {len(result.tokens)}")
JSON Export
Export transcription results to JSON:
# Export the result to JSON. AlignedResult is a dataclass, so
# dataclasses.asdict recursively converts it — including the nested
# AlignedSentence and AlignedToken dataclasses — into plain dicts/lists.
import json
from dataclasses import asdict
from parakeet_mlx import from_pretrained

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
result = model.transcribe("audio.wav")

# Convert to dictionary
result_dict = asdict(result)

# Save to JSON
with open("transcription.json", "w") as f:
    json.dump(result_dict, f, indent=2)

print(json.dumps(result_dict, indent=2))