Overview
WhisperKit supports real-time streaming transcription from a microphone, allowing you to transcribe audio as it’s being spoken.
Quick Start
Stream from Microphone
# Using the CLI
swift run whisperkit-cli transcribe \
--model-path "Models/whisperkit-coreml/openai_whisper-large-v3" \
--stream
This will start recording from your microphone and display transcriptions in real-time.
Building a Streaming App
Here’s how to implement real-time streaming in your own app:
1. Initialize WhisperKit
import WhisperKit
import AVFoundation
/// View model that owns the WhisperKit instance and publishes
/// transcription state to the UI.
class TranscriptionViewModel: ObservableObject {
    @Published var transcribedText = ""
    @Published var isRecording = false

    private var whisperKit: WhisperKit?

    /// Loads the Whisper model, preferring the Neural Engine for both
    /// the audio encoder and the text decoder. Failures are logged and
    /// leave `whisperKit` nil.
    func loadModel() async {
        let computeOptions = ModelComputeOptions(
            audioEncoderCompute: .cpuAndNeuralEngine,
            textDecoderCompute: .cpuAndNeuralEngine
        )
        let config = WhisperKitConfig(
            model: "large-v3",
            computeOptions: computeOptions
        )
        do {
            whisperKit = try await WhisperKit(config)
        } catch {
            print("Failed to load model: \(error)")
        }
    }
}
2. Start Recording and Streaming
/// Begins microphone capture and kicks off the transcription loop.
/// Does nothing if the model has not finished loading yet.
func startRecording() {
    guard let whisperKit = whisperKit else { return }
    Task {
        do {
            // Start audio capture from the default input device.
            try whisperKit.audioProcessor.startRecording()
            // `isRecording` is @Published and drives the UI, so mutate it
            // on the main actor — this Task is not main-actor-isolated.
            await MainActor.run { isRecording = true }
            // Drive the transcription loop until recording stops.
            await transcribeStream()
        } catch {
            print("Failed to start recording: \(error)")
        }
    }
}
/// Stops microphone capture and marks the session as no longer
/// recording, which ends the transcription loop.
func stopRecording() {
    whisperKit?.audioProcessor.stopRecording()
    isRecording = false
}
3. Process Audio Stream
/// Repeatedly transcribes the accumulated audio buffer while recording is
/// active, publishing the latest result on the main actor.
func transcribeStream() async {
    guard let whisperKit = whisperKit else { return }
    while isRecording {
        do {
            // Throttle: run roughly once per second. Sleeping at the TOP of
            // the loop also prevents a busy-wait when no audio is buffered
            // yet — the original slept only after a successful pass, so an
            // empty buffer made `continue` spin the loop at full speed.
            try await Task.sleep(nanoseconds: 1_000_000_000) // 1 second

            // Get the audio captured so far; nothing to do if empty.
            guard let audioBuffer = whisperKit.audioProcessor.audioSamples else {
                continue
            }

            // Transcribe the buffer accumulated so far.
            if let result = try await whisperKit.transcribe(
                audioArray: audioBuffer
            ) {
                // @Published property must be updated on the main actor.
                await MainActor.run {
                    transcribedText = result.text
                }
            }
        } catch {
            print("Transcription error: \(error)")
        }
    }
}
Advanced Streaming Features
Voice Activity Detection (VAD)
Optimize streaming by detecting when speech is present:
// Configure VAD settings
// Configure voice-activity detection so transcription focuses on
// stretches that actually contain speech.
var decodingOptions = DecodingOptions()
decodingOptions.useVAD = true
decodingOptions.silenceThreshold = 0.3

// Transcribe the current buffer with VAD enabled.
let result = try await whisperKit.transcribe(
    audioArray: audioBuffer,
    decodeOptions: decodingOptions
)
Eager Decoding Mode
Get even faster updates with eager streaming mode:
// Enable eager decoding: results are surfaced once a token has been
// confirmed across the required number of consecutive passes.
var decodingOptions = DecodingOptions()
decodingOptions.enableEagerDecoding = true
decodingOptions.tokenConfirmationsNeeded = 2

let result = try await whisperKit.transcribe(
    audioArray: audioBuffer,
    decodeOptions: decodingOptions
)
Eager mode provides faster updates but may produce less accurate intermediate results.
Segment Confirmation
Track confirmed vs. unconfirmed segments:
// Inspect which segments are finalized vs. still subject to revision.
if let result = try await whisperKit.transcribe(audioArray: audioBuffer) {
    // Confirmed segments are finalized and will no longer change.
    result.confirmedSegments.forEach { segment in
        print("✓ [\(segment.start)s]: \(segment.text)")
    }
    // Unconfirmed segments may still be revised by later passes.
    result.unconfirmedSegments.forEach { segment in
        print("⋯ [\(segment.start)s]: \(segment.text)")
    }
}
Complete Streaming Example
Here’s a full SwiftUI example with visualization:
import SwiftUI
import WhisperKit
/// SwiftUI screen showing a live waveform, streamed transcription text,
/// and a record/stop control.
struct StreamingView: View {
    @StateObject private var viewModel = StreamingViewModel()

    var body: some View {
        VStack {
            // Live energy levels rendered as a waveform.
            WaveformView(energyLevels: viewModel.bufferEnergy)
                .frame(height: 100)

            // Transcribed text: confirmed output in bold, with the current
            // (still revisable) hypothesis in gray underneath it.
            ScrollView {
                VStack(alignment: .leading, spacing: 8) {
                    Text(viewModel.confirmedText)
                        .fontWeight(.bold)
                    Text(viewModel.hypothesisText)
                        .foregroundColor(.gray)
                }
                .padding()
            }

            // Record/stop toggle plus an elapsed-time readout.
            HStack {
                Button(action: toggleRecording) {
                    Image(systemName: viewModel.isRecording ? "stop.circle.fill" : "record.circle")
                        .resizable()
                        .frame(width: 60, height: 60)
                        .foregroundColor(.red)
                }
                Text(String(format: "%.1fs", viewModel.recordingDuration))
                    .font(.caption)
            }
            .padding()
        }
        .task {
            await viewModel.loadModel()
        }
    }

    /// Starts or stops recording depending on the current state.
    private func toggleRecording() {
        if viewModel.isRecording {
            viewModel.stopRecording()
        } else {
            viewModel.startRecording()
        }
    }
}
/// Observable model backing `StreamingView`: owns the WhisperKit instance,
/// the recording task, and all published UI state. Runs on the main actor,
/// so @Published mutations inside its tasks are safe.
@MainActor
class StreamingViewModel: ObservableObject {
    @Published var confirmedText = ""
    @Published var hypothesisText = ""
    @Published var isRecording = false
    @Published var bufferEnergy: [Float] = []
    @Published var recordingDuration: TimeInterval = 0

    private var whisperKit: WhisperKit?
    private var recordingTask: Task<Void, Never>?

    /// Loads the "large-v3" Whisper model. Failures are logged and leave
    /// `whisperKit` nil, making `startRecording()` a no-op.
    func loadModel() async {
        do {
            whisperKit = try await WhisperKit(
                WhisperKitConfig(model: "large-v3")
            )
        } catch {
            print("Model load error: \(error)")
        }
    }

    /// Starts microphone capture and the transcription loop in a
    /// cancellable task.
    func startRecording() {
        guard let whisperKit = whisperKit else { return }
        recordingTask = Task {
            do {
                try whisperKit.audioProcessor.startRecording()
                isRecording = true
                await transcriptionLoop()
            } catch {
                print("Recording error: \(error)")
            }
        }
    }

    /// Cancels the loop, stops capture, and resets the duration display.
    func stopRecording() {
        recordingTask?.cancel()
        whisperKit?.audioProcessor.stopRecording()
        isRecording = false
        recordingDuration = 0
    }

    /// Polls the audio buffer roughly once per second, transcribes it, and
    /// splits the result into confirmed vs. hypothesis text by per-word
    /// probability.
    private func transcriptionLoop() async {
        guard let whisperKit = whisperKit else { return }
        let startTime = Date() // never reassigned; was incorrectly `var`
        while !Task.isCancelled && isRecording {
            // Update the elapsed-time readout.
            recordingDuration = Date().timeIntervalSince(startTime)

            // Get the audio captured so far plus energy levels for the waveform.
            if let audioSamples = whisperKit.audioProcessor.audioSamples {
                bufferEnergy = whisperKit.audioProcessor.relativeEnergy ?? []
                do {
                    if let result = try await whisperKit.transcribe(
                        audioArray: audioSamples
                    ), let words = result.allWords {
                        // Words above the probability cutoff are treated as
                        // final; the rest are an in-progress hypothesis.
                        // (Assumes word tokens carry their own leading
                        // whitespace, since they are joined with no
                        // separator — TODO confirm against WhisperKit.)
                        confirmedText = words
                            .filter { $0.probability > 0.5 }
                            .map { $0.word }
                            .joined()
                        hypothesisText = words
                            .filter { $0.probability <= 0.5 }
                            .map { $0.word }
                            .joined()
                    }
                } catch {
                    // Previously swallowed with `try?`; log failures the way
                    // the rest of this class does.
                    print("Transcription error: \(error)")
                }
            }

            // Throttle the loop; a cancelled sleep throws, and the while
            // condition then exits on the next check.
            try? await Task.sleep(for: .seconds(1))
        }
    }
}
Audio Device Selection (macOS)
On macOS, you can select which audio input device to use:
import WhisperKit
// Get available audio devices
// Enumerate the available audio input devices and list them.
let devices = AudioProcessor.getAudioDevices()
devices.forEach { print("Device: \($0.name)") }

// Record from the first device whose name mentions "Microphone".
if let device = devices.first(where: { $0.name.contains("Microphone") }) {
    try whisperKit.audioProcessor.startRecording(
        inputDeviceID: device.id
    )
}
Set computeOptions.audioEncoderCompute and textDecoderCompute to .cpuAndNeuralEngine for best performance.
Tune the chunking strategy and sample length for your use case: set decodingOptions.chunkingStrategy = .vad and decodingOptions.sampleLength = 224.
Lower the temperature fallback count to reduce latency at the cost of accuracy: decodingOptions.temperatureFallbackCount = 3.
Next Steps
Basic Transcription Learn the basics of file-based transcription
Local Server Set up a server for non-Swift clients