
Configuration

WhisperKit provides extensive configuration options through WhisperKitConfig and ModelComputeOptions to customize model loading, computation, and transcription behavior.

WhisperKitConfig

The main configuration class for initializing WhisperKit:
public class WhisperKitConfig {
    // Model configuration
    var model: String?
    var downloadBase: URL?
    var modelRepo: String?
    var modelToken: String?
    var modelEndpoint: String?
    var modelFolder: String?
    var tokenizerFolder: URL?
    
    // Compute configuration
    var computeOptions: ModelComputeOptions?
    var audioInputConfig: AudioInputConfig?
    
    // Component customization
    var audioProcessor: (any AudioProcessing)?
    var featureExtractor: (any FeatureExtracting)?
    var audioEncoder: (any AudioEncoding)?
    var textDecoder: (any TextDecoding)?
    var logitsFilters: [any LogitsFiltering]?
    var segmentSeeker: (any SegmentSeeking)?
    var voiceActivityDetector: VoiceActivityDetector?
    
    // Behavior options
    var verbose: Bool
    var logLevel: Logging.LogLevel
    var prewarm: Bool?
    var load: Bool?
    var download: Bool
    var useBackgroundDownloadSession: Bool
}
See WhisperKitConfig

Basic Configuration

Simple Initialization

// Use defaults
let whisperKit = try await WhisperKit()

// Specify model
let whisperKit = try await WhisperKit(model: "base")

// With configuration object
let config = WhisperKitConfig(
    model: "small",
    verbose: true,
    download: true
)
let whisperKit = try await WhisperKit(config)

Convenience Initializer

let whisperKit = try await WhisperKit(
    model: "base",
    computeOptions: ModelComputeOptions(),
    verbose: true,
    prewarm: true,
    load: true
)
See WhisperKit.init

Model Configuration

Model Selection

model
String?
default:"nil"
Model variant name (e.g., “tiny”, “base”, “small”, “medium”, “large-v3”). If nil, uses the recommended model for the device.
let config = WhisperKitConfig(
    model: "large-v3"  // Specify model
)

Model Repository

modelRepo
String?
default:"argmaxinc/whisperkit-coreml"
Hugging Face repository containing the models.
modelToken
String?
default:"nil"
Authentication token for private repositories.
modelEndpoint
String?
default:"https://huggingface.co"
Custom Hugging Face Hub endpoint URL.
let config = WhisperKitConfig(
    model: "base",
    modelRepo: "my-org/custom-whisper-models",
    modelToken: "hf_...",
    modelEndpoint: "https://custom-hub.example.com"
)

Local Models

modelFolder
String?
default:"nil"
Path to a local model folder. If set, download is typically set to false.
tokenizerFolder
URL?
default:"nil"
Path to tokenizer files. If nil, searches model folder and downloads if needed.
let config = WhisperKitConfig(
    modelFolder: "/path/to/models/openai_whisper-base",
    tokenizerFolder: URL(fileURLWithPath: "/path/to/tokenizers"),
    download: false
)

Download Configuration

downloadBase
URL?
default:"nil"
Base directory for downloading models. If nil, uses default cache location.
download
Bool
default:"true"
Whether to download models if not available locally.
useBackgroundDownloadSession
Bool
default:"false"
Use background URL session for downloads (survives app suspension).
let documentsURL = FileManager.default.urls(
    for: .documentDirectory,
    in: .userDomainMask
).first!

let config = WhisperKitConfig(
    model: "medium",
    downloadBase: documentsURL.appendingPathComponent("Models"),
    download: true,
    useBackgroundDownloadSession: true  // For large models
)

ModelComputeOptions

Control which compute units (CPU, GPU, Neural Engine) are used for each model component:
public struct ModelComputeOptions {
    var melCompute: MLComputeUnits
    var audioEncoderCompute: MLComputeUnits
    var textDecoderCompute: MLComputeUnits
    var prefillCompute: MLComputeUnits
}
See ModelComputeOptions

Compute Unit Options

  • .cpuOnly - CPU only
  • .cpuAndGPU - CPU and GPU
  • .cpuAndNeuralEngine - CPU and Neural Engine
  • .all - All available units

Default Compute Configuration

let computeOptions = ModelComputeOptions(
    melCompute: .cpuAndGPU,              // Mel spectrogram on GPU
    audioEncoderCompute: .cpuAndNeuralEngine,  // Encoder on Neural Engine (iOS 17+)
    textDecoderCompute: .cpuAndNeuralEngine,   // Decoder on Neural Engine
    prefillCompute: .cpuOnly             // Prefill on CPU
)

let config = WhisperKitConfig(
    model: "base",
    computeOptions: computeOptions
)

Optimizing for Different Devices

// For older devices (pre-A16)
let computeOptions = ModelComputeOptions(
    audioEncoderCompute: .cpuAndGPU,  // GPU fallback
    textDecoderCompute: .cpuAndNeuralEngine
)

// For maximum performance (A17 Pro+)
let computeOptions = ModelComputeOptions(
    audioEncoderCompute: .cpuAndNeuralEngine,
    textDecoderCompute: .cpuAndNeuralEngine
)

// For debugging (slowest, most predictable)
let computeOptions = ModelComputeOptions(
    melCompute: .cpuOnly,
    audioEncoderCompute: .cpuOnly,
    textDecoderCompute: .cpuOnly,
    prefillCompute: .cpuOnly
)

Loading Behavior

Prewarm

prewarm
Bool?
default:"nil"
Load and unload models sequentially to trigger Core ML specialization with lower peak memory usage.
let config = WhisperKitConfig(
    model: "large-v3",
    prewarm: true  // Reduces peak memory at cost of 2x load time
)
See WhisperKitConfig.prewarm
When to use prewarm:
  • Loading large models (medium, large)
  • Memory-constrained devices
  • First launch after OS update (triggers Core ML compilation)
Trade-offs:
  • Roughly doubles model load time on the first (uncached) load; once models are cached, the overhead is usually under 1 second
  • Significantly reduces peak memory usage

Load

load
Bool?
default:"nil"
Whether to load models immediately. If nil, loads if modelFolder is provided.
// Download but don't load yet
let config = WhisperKitConfig(
    model: "base",
    load: false  // Defer loading
)

let whisperKit = try await WhisperKit(config)

// Load later
try await whisperKit.loadModels()

Logging

Verbosity

verbose
Bool
default:"true"
Enable detailed logging output.
logLevel
Logging.LogLevel
default:".info"
Maximum log level to display: .debug, .info, .error, .none.
let config = WhisperKitConfig(
    verbose: true,
    logLevel: .debug  // Show all logs
)

let whisperKit = try await WhisperKit(config)

// Custom logging callback
whisperKit.loggingCallback = { level, message in
    print("[\(level)] \(message)")
    // Send to analytics, file, etc.
}
See WhisperKit.loggingCallback

Custom Components

Replace default components with custom implementations:

Audio Processor

audioProcessor
AudioProcessing?
default:"AudioProcessor()"
Custom audio processing implementation.
class CustomAudioProcessor: AudioProcessing {
    // Custom implementation
}

let config = WhisperKitConfig(
    audioProcessor: CustomAudioProcessor()
)

Feature Extractor

featureExtractor
FeatureExtracting?
default:"FeatureExtractor()"
Custom mel spectrogram feature extraction.

Audio Encoder

audioEncoder
AudioEncoding?
default:"AudioEncoder()"
Custom audio encoding implementation.

Text Decoder

textDecoder
TextDecoding?
default:"TextDecoder()"
Custom text decoding implementation.

Logits Filters

logitsFilters
[any LogitsFiltering]?
default:"nil"
Custom filters to modify logits before sampling.
class CustomLogitsFilter: LogitsFiltering {
    func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray {
        // Custom filtering logic
        return logits
    }
}

let config = WhisperKitConfig(
    logitsFilters: [CustomLogitsFilter()]
)

Segment Seeker

segmentSeeker
SegmentSeeking?
default:"SegmentSeeker()"
Custom segment seeking logic for timestamp alignment.

Voice Activity Detector

voiceActivityDetector
VoiceActivityDetector?
default:"nil"
VAD implementation for audio chunking.
let vad = EnergyVAD(energyThreshold: 0.02)

let config = WhisperKitConfig(
    voiceActivityDetector: vad
)

Audio Input Configuration

audioInputConfig
AudioInputConfig?
default:"AudioInputConfig()"
Configuration for audio input processing.
public struct AudioInputConfig {
    var channelMode: ChannelMode  // .mono, .stereo, etc.
    // Additional audio configuration
}

let audioConfig = AudioInputConfig(
    channelMode: .mono
)

let config = WhisperKitConfig(
    audioInputConfig: audioConfig
)
See WhisperKitConfig.audioInputConfig

State Callbacks

Monitor model state changes and transcription progress:

Model State Callback

let whisperKit = try await WhisperKit()

whisperKit.modelStateCallback = { oldState, newState in
    print("Model state: \(oldState) -> \(newState)")
    
    switch newState {
    case .loading:
        showLoadingIndicator()
    case .loaded:
        hideLoadingIndicator()
    case .unloaded:
        clearMemory()
    default:
        break
    }
}
See WhisperKit.modelStateCallback

Transcription State Callback

whisperKit.transcriptionStateCallback = { state in
    switch state {
    case .convertingAudio:
        print("Converting audio...")
    case .transcribing:
        print("Transcribing...")
    case .finished:
        print("Done!")
    }
}
See WhisperKit.transcriptionStateCallback

Segment Discovery Callback

whisperKit.segmentDiscoveryCallback = { segments in
    for segment in segments {
        print("[\(segment.start)s - \(segment.end)s]: \(segment.text)")
        // Update UI in real-time
    }
}
See WhisperKit.segmentDiscoveryCallback

Complete Configuration Example

import WhisperKit

func setupWhisperKit() async throws -> WhisperKit {
    // Custom VAD
    let vad = EnergyVAD(
        frameLength: 0.1,
        energyThreshold: 0.02
    )
    
    // Compute options
    let computeOptions = ModelComputeOptions(
        melCompute: .cpuAndGPU,
        audioEncoderCompute: .cpuAndNeuralEngine,
        textDecoderCompute: .cpuAndNeuralEngine,
        prefillCompute: .cpuOnly
    )
    
    // Audio configuration
    let audioConfig = AudioInputConfig(
        channelMode: .mono
    )
    
    // Main configuration
    let config = WhisperKitConfig(
        model: "base",
        modelRepo: "argmaxinc/whisperkit-coreml",
        computeOptions: computeOptions,
        audioInputConfig: audioConfig,
        voiceActivityDetector: vad,
        verbose: true,
        logLevel: .info,
        prewarm: true,
        load: true,
        download: true,
        useBackgroundDownloadSession: false
    )
    
    // Initialize
    let whisperKit = try await WhisperKit(config)
    
    // Setup callbacks
    whisperKit.modelStateCallback = { oldState, newState in
        print("Model: \(oldState) -> \(newState)")
    }
    
    whisperKit.transcriptionStateCallback = { state in
        print("Transcription: \(state)")
    }
    
    whisperKit.segmentDiscoveryCallback = { segments in
        for segment in segments {
            print("Segment: \(segment.text)")
        }
    }
    
    whisperKit.loggingCallback = { level, message in
        // Custom logging
        if level == .error {
            sendToErrorTracking(message)
        }
    }
    
    return whisperKit
}

// Usage
let whisperKit = try await setupWhisperKit()

// Transcribe
var options = DecodingOptions(
    verbose: true,
    language: "en",
    wordTimestamps: true,
    chunkingStrategy: .vad
)

let results = try await whisperKit.transcribe(
    audioPath: "audio.wav",
    decodeOptions: options
)

Best Practices

Memory Management

Use prewarm: true for large models on memory-constrained devices. Call unloadModels() when not in use.

Compute Units

Use Neural Engine for iOS 17+ devices. Fall back to GPU for older devices.

Logging

Enable verbose logging during development. Disable in production for performance.

Background Downloads

Use background sessions for large models (medium, large) to prevent interruption.

Performance Tuning

For Real-time Streaming

let config = WhisperKitConfig(
    model: "tiny",  // Fastest model
    computeOptions: ModelComputeOptions(
        audioEncoderCompute: .cpuAndNeuralEngine,
        textDecoderCompute: .cpuAndNeuralEngine
    ),
    voiceActivityDetector: EnergyVAD(energyThreshold: 0.02),
    prewarm: false,  // Skip prewarm for faster init
    load: true
)

For Maximum Accuracy

let config = WhisperKitConfig(
    model: "large-v3",  // Most accurate
    computeOptions: ModelComputeOptions(
        audioEncoderCompute: .cpuAndNeuralEngine,
        textDecoderCompute: .cpuAndNeuralEngine
    ),
    prewarm: true,  // Manage memory for large model
    load: true
)

For Minimal Memory

let config = WhisperKitConfig(
    model: "tiny",
    prewarm: true,
    load: false  // Load on-demand
)

let whisperKit = try await WhisperKit(config)

// Load only when needed
try await whisperKit.loadModels()

// Unload after use
await whisperKit.unloadModels()

Next Steps

Transcription

Start transcribing audio

Streaming

Real-time audio transcription

Build docs developers (and LLMs) love