Real-time audio transcription with streaming inference
Streaming transcription allows you to transcribe audio in real-time as it’s being captured, perfect for live applications like voice assistants, live captioning, or real-time meeting transcription.
Parakeet MLX provides streaming inference through the transcribe_stream() context manager, which processes audio chunks incrementally while maintaining context across chunks.
# Load a pretrained Parakeet model (downloads from the Hugging Face hub on first use).
from parakeet_mlx import from_pretrained

model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")
Step 2: Create the streaming context
# Open a streaming transcription context; state is cleaned up when the block exits.
with model.transcribe_stream(context_size=(256, 256)) as transcriber:
    # Add audio chunks here
    pass
Step 3: Add audio chunks
# Feed audio incrementally and read the running transcription after each chunk.
with model.transcribe_stream(context_size=(256, 256)) as transcriber:
    # Process audio chunks
    transcriber.add_audio(audio_chunk)

    # Get current result
    result = transcriber.result
    print(result.text)
Here’s a complete example that simulates real-time streaming:
# Complete example: simulate real-time streaming by feeding a file in 1-second chunks.
import mlx.core as mx

from parakeet_mlx import from_pretrained
from parakeet_mlx.audio import load_audio

# Load model
model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")

# Create streaming context
with model.transcribe_stream(
    context_size=(256, 256),  # (left_context, right_context) frames
    depth=1,  # Cache depth
) as transcriber:
    # Load audio (in practice, this would come from a microphone)
    audio_data = load_audio(
        "audio_file.wav",
        model.preprocessor_config.sample_rate,
    )

    # Process in 1-second chunks
    chunk_size = model.preprocessor_config.sample_rate  # 1 second
    for i in range(0, len(audio_data), chunk_size):
        chunk = audio_data[i:i + chunk_size]

        # Add audio chunk
        transcriber.add_audio(chunk)

        # Get current transcription
        result = transcriber.result

        # Display update (carriage return overwrites the previous line)
        print(f"\rCurrent: {result.text}", end="")

    # Final result
    print(f"\n\nFinal: {result.text}")
A tuple of (left_context, right_context) specifying the attention window size in encoder frames.
left_context: How many frames to look back
right_context: How many frames to look ahead
Larger values provide more context but increase memory usage.
# Small context (lower latency, less accuracy)
with model.transcribe_stream(context_size=(128, 128)) as transcriber:
    pass

# Large context (higher latency, more accuracy)
with model.transcribe_stream(context_size=(512, 512)) as transcriber:
    pass
Number of encoder layers that preserve exact computation across chunks.
depth=1: Only first encoder layer matches non-streaming computation exactly
depth=2: First two layers match exactly
depth=N: Full equivalence to non-streaming forward pass (where N is total layers)
Higher depth improves consistency with non-streaming mode but uses more memory.
# Minimal cache (lower memory)
with model.transcribe_stream(depth=1) as transcriber:
    pass

# More consistent with non-streaming (higher memory)
with model.transcribe_stream(depth=4) as transcriber:
    pass
Whether to preserve the original attention mechanism.
False: Switches to local attention (recommended for streaming)
True: Keeps original attention (less suitable for streaming)
# Use local attention (recommended)
with model.transcribe_stream(keep_original_attention=False) as transcriber:
    pass

# Keep original attention
with model.transcribe_stream(keep_original_attention=True) as transcriber:
    pass
# Read the full running transcription via the `result` property.
with model.transcribe_stream() as transcriber:
    transcriber.add_audio(audio_chunk)

    # Get current result (AlignedResult)
    result = transcriber.result
    print(result.text)
    print(result.sentences)
# Finalized tokens are stable: they will not change as more audio arrives.
with model.transcribe_stream() as transcriber:
    transcriber.add_audio(audio_chunk)

    # Get finalized tokens (list[AlignedToken])
    finalized = transcriber.finalized_tokens
    for token in finalized:
        print(f"{token.text} [{token.start:.2f}s]")
# Draft tokens are provisional and may be revised by later audio.
with model.transcribe_stream() as transcriber:
    transcriber.add_audio(audio_chunk)

    # Get draft tokens (list[AlignedToken])
    draft = transcriber.draft_tokens
    for token in draft:
        print(f"[DRAFT] {token.text}")
Draft tokens provide a preview of what might come next but may change as more audio is processed.
# Stream chunks and print word-level timestamps as tokens become finalized.
model = from_pretrained("mlx-community/parakeet-tdt-0.6b-v3")

with model.transcribe_stream(context_size=(256, 256)) as transcriber:
    for chunk in audio_chunks:
        transcriber.add_audio(chunk)

        # Get finalized tokens with timestamps
        for token in transcriber.finalized_tokens:
            print(f"{token.start:.2f}s: {token.text}")