Skip to main content

Overview

WhisperKit supports real-time streaming transcription from a microphone, allowing you to transcribe audio as it’s being spoken.

Quick Start

Stream from Microphone

# Using the CLI
swift run whisperkit-cli transcribe \
    --model-path "Models/whisperkit-coreml/openai_whisper-large-v3" \
    --stream
This will start recording from your microphone and display transcriptions in real-time.

Building a Streaming App

Here’s how to implement real-time streaming in your own app:

1. Initialize WhisperKit

import WhisperKit
import AVFoundation

/// Observable model that owns the WhisperKit instance and publishes
/// transcription state for SwiftUI to render.
class TranscriptionViewModel: ObservableObject {
    @Published var transcribedText = ""
    @Published var isRecording = false
    
    private var whisperKit: WhisperKit?
    
    /// Loads the Whisper model asynchronously; call once at startup.
    /// Failures are logged rather than surfaced to the UI.
    func loadModel() async {
        // Prefer the Neural Engine for both the audio encoder and the text
        // decoder — the recommended configuration on Apple silicon.
        let computeOptions = ModelComputeOptions(
            audioEncoderCompute: .cpuAndNeuralEngine,
            textDecoderCompute: .cpuAndNeuralEngine
        )
        let config = WhisperKitConfig(
            model: "large-v3",
            computeOptions: computeOptions
        )
        do {
            whisperKit = try await WhisperKit(config)
        } catch {
            print("Failed to load model: \(error)")
        }
    }
}

2. Start Recording and Streaming

/// Begins microphone capture and kicks off the streaming transcription loop.
/// Does nothing if the model has not finished loading yet.
func startRecording() {
    guard let whisperKit else { return }
    
    Task {
        do {
            // Begin pulling samples from the microphone.
            try whisperKit.audioProcessor.startRecording()
            isRecording = true
            
            // Drive transcription until `isRecording` is cleared.
            await transcribeStream()
        } catch {
            print("Failed to start recording: \(error)")
        }
    }
}

/// Stops microphone capture. The audio pipeline is shut down first; the
/// transcription loop then observes `isRecording == false` on its next
/// iteration and exits — do not reorder these two statements.
func stopRecording() {
    whisperKit?.audioProcessor.stopRecording()
    isRecording = false
}

3. Process Audio Stream

/// Repeatedly transcribes the accumulated microphone buffer while recording
/// is active, publishing the latest text to the UI roughly once per second.
func transcribeStream() async {
    guard let whisperKit = whisperKit else { return }
    
    while isRecording {
        // Transcribe whatever audio has accumulated so far. When no samples
        // are available yet we fall through to the sleep below — the previous
        // `guard … else { continue }` skipped the sleep and busy-waited at
        // 100% CPU until the first buffer arrived.
        if let audioBuffer = whisperKit.audioProcessor.audioSamples {
            do {
                if let result = try await whisperKit.transcribe(
                    audioArray: audioBuffer
                ) {
                    // Publish on the main actor: `transcribedText` drives the UI.
                    await MainActor.run {
                        transcribedText = result.text
                    }
                }
            } catch {
                print("Transcription error: \(error)")
            }
        }
        
        // Throttle: runs on every iteration, including empty-buffer passes.
        // `try?` — a sleep failure only means the task was cancelled, and the
        // loop condition handles shutdown.
        try? await Task.sleep(nanoseconds: 1_000_000_000) // 1 second
    }
}

Advanced Streaming Features

Voice Activity Detection (VAD)

Optimize streaming by detecting when speech is present:
// Configure VAD settings.
// `silenceThreshold` is the energy level below which audio is treated as
// silence — 0.3 is a moderate starting point (assumed relative 0–1 scale;
// verify against the DecodingOptions API docs).
var decodingOptions = DecodingOptions()
decodingOptions.silenceThreshold = 0.3
decodingOptions.useVAD = true

// Pass the options through `decodeOptions` so silent stretches are skipped
// instead of being decoded.
let result = try await whisperKit.transcribe(
    audioArray: audioBuffer,
    decodeOptions: decodingOptions
)

Eager Decoding Mode

Get even faster updates with eager streaming mode:
// Eager decoding emits tokens before a segment is finalized.
// `tokenConfirmationsNeeded` is how many consecutive agreements a token
// needs before it is treated as confirmed — lower is faster but noisier.
var decodingOptions = DecodingOptions()
decodingOptions.enableEagerDecoding = true
decodingOptions.tokenConfirmationsNeeded = 2

// Transcribe with the eager options; intermediate results may be revised
// on later passes.
let result = try await whisperKit.transcribe(
    audioArray: audioBuffer,
    decodeOptions: decodingOptions
)
Eager mode provides faster updates but may produce less accurate intermediate results.

Segment Confirmation

Track confirmed vs. unconfirmed segments:
// `transcribe` returns an optional result, hence the binding.
if let result = try await whisperKit.transcribe(audioArray: audioBuffer) {
    // Confirmed segments (finalized — safe to display as permanent text)
    for segment in result.confirmedSegments {
        print("✓ [\(segment.start)s]: \(segment.text)")
    }
    
    // Unconfirmed segments (still being processed — may change on later passes)
    for segment in result.unconfirmedSegments {
        print("⋯ [\(segment.start)s]: \(segment.text)")
    }
}

Complete Streaming Example

Here’s a full SwiftUI example with visualization:
import SwiftUI
import WhisperKit

/// SwiftUI screen showing a live waveform, the streaming transcript, and a
/// record/stop control.
struct StreamingView: View {
    @StateObject private var viewModel = StreamingViewModel()
    
    var body: some View {
        VStack {
            waveform
            transcript
            controls
        }
        .task {
            await viewModel.loadModel()
        }
    }
    
    /// Audio waveform visualization driven by the model's energy samples.
    private var waveform: some View {
        WaveformView(energyLevels: viewModel.bufferEnergy)
            .frame(height: 100)
    }
    
    /// Scrolling transcript: confirmed text in bold, hypothesis text in gray.
    private var transcript: some View {
        ScrollView {
            VStack(alignment: .leading, spacing: 8) {
                Text(viewModel.confirmedText)
                    .fontWeight(.bold)
                
                Text(viewModel.hypothesisText)
                    .foregroundColor(.gray)
            }
            .padding()
        }
    }
    
    /// Record/stop toggle plus elapsed-time readout.
    private var controls: some View {
        HStack {
            Button {
                if viewModel.isRecording {
                    viewModel.stopRecording()
                } else {
                    viewModel.startRecording()
                }
            } label: {
                Image(systemName: viewModel.isRecording ? "stop.circle.fill" : "record.circle")
                    .resizable()
                    .frame(width: 60, height: 60)
                    .foregroundColor(.red)
            }
            
            Text(String(format: "%.1fs", viewModel.recordingDuration))
                .font(.caption)
        }
        .padding()
    }
}

/// Drives the streaming UI: owns the WhisperKit instance, the recording
/// task, and all `@Published` state. Isolated to the main actor so every
/// published mutation is UI-safe without explicit hops.
@MainActor
class StreamingViewModel: ObservableObject {
    @Published var confirmedText = ""
    @Published var hypothesisText = ""
    @Published var isRecording = false
    @Published var bufferEnergy: [Float] = []
    @Published var recordingDuration: TimeInterval = 0
    
    private var whisperKit: WhisperKit?
    private var recordingTask: Task<Void, Never>?
    
    /// Loads the default large-v3 model. Errors are logged, not surfaced.
    func loadModel() async {
        do {
            whisperKit = try await WhisperKit(
                WhisperKitConfig(model: "large-v3")
            )
        } catch {
            print("Model load error: \(error)")
        }
    }
    
    /// Starts microphone capture and launches the transcription loop.
    /// Does nothing until `loadModel()` has completed successfully.
    func startRecording() {
        guard let whisperKit = whisperKit else { return }
        
        recordingTask = Task {
            do {
                try whisperKit.audioProcessor.startRecording()
                isRecording = true
                await transcriptionLoop()
            } catch {
                print("Recording error: \(error)")
            }
        }
    }
    
    /// Cancels the loop, stops capture, and resets the duration display.
    func stopRecording() {
        recordingTask?.cancel()
        whisperKit?.audioProcessor.stopRecording()
        isRecording = false
        recordingDuration = 0
    }
    
    /// Polls the audio buffer about once per second, splitting each result
    /// into confirmed (high-probability) and hypothesis (low-probability)
    /// words for the two-tone transcript display.
    private func transcriptionLoop() async {
        guard let whisperKit = whisperKit else { return }
        
        // Fixed reference point for the elapsed-time display.
        // (Was a `var` and a dead `confirmedWords` accumulator in the
        // original — the accumulator was assigned but never read.)
        let startTime = Date()
        
        while !Task.isCancelled && isRecording {
            recordingDuration = Date().timeIntervalSince(startTime)
            
            // Snapshot the current audio buffer and its energy levels.
            if let audioSamples = whisperKit.audioProcessor.audioSamples {
                bufferEnergy = whisperKit.audioProcessor.relativeEnergy ?? []
                
                // `try?` is deliberate: one failed pass should not kill the
                // loop; the next iteration retries with a fresh buffer.
                if let result = try? await whisperKit.transcribe(
                    audioArray: audioSamples
                ) {
                    if let words = result.allWords {
                        let split = partition(words)
                        confirmedText = split.confirmed
                        hypothesisText = split.hypothesis
                    }
                }
            }
            
            // Throttle to roughly one transcription pass per second.
            try? await Task.sleep(for: .seconds(1))
        }
    }
    
    /// Splits `words` at the 0.5 probability threshold (an arbitrary cut-off
    /// — tune per model) and joins each half into display text. Whisper word
    /// tokens are assumed to carry their own leading spaces, so `joined()`
    /// takes no separator — TODO confirm against WordTiming output.
    private func partition(_ words: [WordTiming]) -> (confirmed: String, hypothesis: String) {
        let confirmed = words.filter { $0.probability > 0.5 }
        let hypothesis = words.filter { $0.probability <= 0.5 }
        return (
            confirmed.map { $0.word }.joined(),
            hypothesis.map { $0.word }.joined()
        )
    }
}

Audio Device Selection (macOS)

On macOS, you can select which audio input device to use:
import WhisperKit

// Get available audio devices.
// NOTE(review): device enumeration here is macOS-specific — guard with
// `#if os(macOS)` in cross-platform code.
let devices = AudioProcessor.getAudioDevices()

for device in devices {
    print("Device: \(device.name)")
}

// Start recording from a specific device.
// Picks the first device whose name contains "Microphone" — adjust the
// match for your hardware; `selectedDevice` is nil when nothing matches.
let selectedDevice = devices.first { $0.name.contains("Microphone") }
if let device = selectedDevice {
    try whisperKit.audioProcessor.startRecording(
        inputDeviceID: device.id
    )
}

Performance Tips

Set computeOptions.audioEncoderCompute and textDecoderCompute to .cpuAndNeuralEngine for best performance.
Tune the audio buffer size and chunking strategy for your use case:
decodingOptions.chunkingStrategy = .vad
decodingOptions.sampleLength = 224
Lower temperatureFallbackCount to reduce latency at the cost of accuracy:
decodingOptions.temperatureFallbackCount = 3

Next Steps

Basic Transcription

Learn the basics of file-based transcription

Local Server

Set up a server for non-Swift clients

Build docs developers (and LLMs) love