Configuration
WhisperKit provides extensive configuration options through WhisperKitConfig and ModelComputeOptions to customize model loading, computation, and transcription behavior.
WhisperKitConfig
The main configuration class for initializing WhisperKit:
public class WhisperKitConfig {
    // Model configuration
    var model: String?
    var downloadBase: URL?
    var modelRepo: String?
    var modelToken: String?
    var modelEndpoint: String?
    var modelFolder: String?
    var tokenizerFolder: URL?

    // Compute configuration
    var computeOptions: ModelComputeOptions?
    var audioInputConfig: AudioInputConfig?

    // Component customization
    var audioProcessor: (any AudioProcessing)?
    var featureExtractor: (any FeatureExtracting)?
    var audioEncoder: (any AudioEncoding)?
    var textDecoder: (any TextDecoding)?
    var logitsFilters: [any LogitsFiltering]?
    var segmentSeeker: (any SegmentSeeking)?
    var voiceActivityDetector: VoiceActivityDetector?

    // Behavior options
    var verbose: Bool
    var logLevel: Logging.LogLevel
    var prewarm: Bool?
    var load: Bool?
    var download: Bool
    var useBackgroundDownloadSession: Bool
}
See WhisperKitConfig
Basic Configuration
Simple Initialization
// Use defaults
let whisperKit = try await WhisperKit()

// Specify model
let whisperKit = try await WhisperKit(model: "base")

// With configuration object
let config = WhisperKitConfig(
    model: "small",
    verbose: true,
    download: true
)
let whisperKit = try await WhisperKit(config)
Convenience Initializer
let whisperKit = try await WhisperKit(
    model: "base",
    computeOptions: ModelComputeOptions(),
    verbose: true,
    prewarm: true,
    load: true
)
See WhisperKit.init
Model Configuration
Model Selection
model: String?
Model variant name (e.g., "tiny", "base", "small", "medium", "large-v3"). If nil, uses the recommended model for the device.
let config = WhisperKitConfig(
    model: "large-v3" // Specify model
)
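If you prefer to choose a model programmatically, WhisperKit exposes helpers for discovering recommended and available variants. The sketch below assumes WhisperKit.recommendedModels() and WhisperKit.fetchAvailableModels(from:) as they appear in recent releases; their exact return types and signatures may differ between versions, so verify against the release you are using.
import WhisperKit

// Hypothetical sketch: pick a model variant at runtime instead of hardcoding one.
// The helper names and return types are assumptions; check your WhisperKit version.
func pickModel() async throws -> String {
    // Recommendation for the current device
    let recommendation = WhisperKit.recommendedModels()
    print("Recommended for this device: \(recommendation)")

    // Models published in the default repository
    let available = try await WhisperKit.fetchAvailableModels(from: "argmaxinc/whisperkit-coreml")
    print("Available remotely: \(available)")

    // Fall back to a known-good default if nothing else applies
    return available.first ?? "base"
}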
Model Repository
modelRepo: String? (default: "argmaxinc/whisperkit-coreml")
Hugging Face repository containing the models.
modelToken: String?
Authentication token for private repositories.
modelEndpoint: String? (default: "https://huggingface.co")
Custom Hugging Face Hub endpoint URL.
let config = WhisperKitConfig(
    model: "base",
    modelRepo: "my-org/custom-whisper-models",
    modelToken: "hf_...",
    modelEndpoint: "https://custom-hub.example.com"
)
Local Models
modelFolder: String?
Path to a local model folder. If set, download is typically set to false.
tokenizerFolder: URL?
Path to tokenizer files. If nil, searches the model folder and downloads if needed.
let config = WhisperKitConfig(
    modelFolder: "/path/to/models/openai_whisper-base",
    tokenizerFolder: URL(fileURLWithPath: "/path/to/tokenizers"),
    download: false
)
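When you cache or bundle models yourself, it helps to confirm the folder actually exists before disabling downloads. A minimal sketch, where the local path and the fallback model name are illustrative assumptions:
import Foundation
import WhisperKit

// Sketch: prefer an on-disk model folder if present, otherwise fall back to downloading.
// The folder path passed in is an assumption for illustration only.
func makeConfig(localModelPath: String) -> WhisperKitConfig {
    if FileManager.default.fileExists(atPath: localModelPath) {
        // Use the cached model and skip network access entirely
        return WhisperKitConfig(
            modelFolder: localModelPath,
            download: false
        )
    } else {
        // Nothing cached yet: let WhisperKit download the named variant
        return WhisperKitConfig(
            model: "base",
            download: true
        )
    }
}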
Download Configuration
downloadBase: URL?
Base directory for downloading models. If nil, uses the default cache location.
download: Bool
Whether to download models if not available locally.
useBackgroundDownloadSession: Bool
Use a background URL session for downloads (survives app suspension).
let documentsURL = FileManager.default.urls(
    for: .documentDirectory,
    in: .userDomainMask
).first!

let config = WhisperKitConfig(
    model: "medium",
    downloadBase: documentsURL.appendingPathComponent("Models"),
    download: true,
    useBackgroundDownloadSession: true // For large models
)
ModelComputeOptions
Control which compute units (CPU, GPU, Neural Engine) are used for each model component:
public struct ModelComputeOptions {
var melCompute: MLComputeUnits
var audioEncoderCompute: MLComputeUnits
var textDecoderCompute: MLComputeUnits
var prefillCompute: MLComputeUnits
}
See ModelComputeOptions
Compute Unit Options
.cpuOnly - CPU only
.cpuAndGPU - CPU and GPU
.cpuAndNeuralEngine - CPU and Neural Engine
.all - All available units
Default Compute Configuration
let computeOptions = ModelComputeOptions(
    melCompute: .cpuAndGPU,                   // Mel spectrogram on GPU
    audioEncoderCompute: .cpuAndNeuralEngine, // Encoder on Neural Engine (iOS 17+)
    textDecoderCompute: .cpuAndNeuralEngine,  // Decoder on Neural Engine
    prefillCompute: .cpuOnly                  // Prefill on CPU
)

let config = WhisperKitConfig(
    model: "base",
    computeOptions: computeOptions
)
Optimizing for Different Devices
// For older devices (pre-A16)
let computeOptions = ModelComputeOptions(
    audioEncoderCompute: .cpuAndGPU, // GPU fallback
    textDecoderCompute: .cpuAndNeuralEngine
)

// For maximum performance (A17 Pro+)
let computeOptions = ModelComputeOptions(
    audioEncoderCompute: .cpuAndNeuralEngine,
    textDecoderCompute: .cpuAndNeuralEngine
)

// For debugging (slowest, most predictable)
let computeOptions = ModelComputeOptions(
    melCompute: .cpuOnly,
    audioEncoderCompute: .cpuOnly,
    textDecoderCompute: .cpuOnly,
    prefillCompute: .cpuOnly
)
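You can also choose between these presets at runtime rather than per build. A minimal sketch, assuming the Core ML MLComputeDevice.allComputeDevices API (iOS 17 / macOS 14 and later) is available on your deployment target:
import CoreML
import WhisperKit

// Sketch: select compute options based on whether Core ML reports a Neural Engine.
// Assumes the MLComputeDevice.allComputeDevices API (iOS 17 / macOS 14+).
func preferredComputeOptions() -> ModelComputeOptions {
    let hasNeuralEngine = MLComputeDevice.allComputeDevices.contains { device in
        if case .neuralEngine = device { return true }
        return false
    }

    if hasNeuralEngine {
        return ModelComputeOptions(
            audioEncoderCompute: .cpuAndNeuralEngine,
            textDecoderCompute: .cpuAndNeuralEngine
        )
    } else {
        // No ANE reported: let the GPU carry the encoder and decoder instead
        return ModelComputeOptions(
            audioEncoderCompute: .cpuAndGPU,
            textDecoderCompute: .cpuAndGPU
        )
    }
}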
Loading Behavior
Prewarm
Load and unload models sequentially to trigger Core ML specialization with lower peak memory usage.
let config = WhisperKitConfig(
    model: "large-v3",
    prewarm: true // Reduces peak memory at cost of 2x load time
)
See WhisperKitConfig.prewarm
When to use prewarm:
Loading large models (medium, large)
Memory-constrained devices
First launch after OS update (triggers Core ML compilation)
Trade-offs:
Doubles model load time (usually less than 1s overhead when cached)
Significantly reduces peak memory usage
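One practical pattern is to enable prewarm only when it is likely to pay off. A minimal sketch, where the model-name check and the 6 GB memory threshold are illustrative assumptions rather than WhisperKit recommendations:
import Foundation
import WhisperKit

// Sketch: opt into prewarming only for larger variants or lower-memory devices.
// The 6 GB threshold and the name check are arbitrary illustrative choices.
func makeConfig(for modelName: String) -> WhisperKitConfig {
    let memoryThreshold: UInt64 = 6 * 1024 * 1024 * 1024 // 6 GB, in bytes
    let isLowMemoryDevice = ProcessInfo.processInfo.physicalMemory < memoryThreshold
    let isLargeModel = modelName.contains("medium") || modelName.contains("large")

    return WhisperKitConfig(
        model: modelName,
        prewarm: isLargeModel || isLowMemoryDevice
    )
}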
Load
load: Bool?
Whether to load models immediately. If nil, loads if modelFolder is provided.
// Download but don't load yet
let config = WhisperKitConfig(
    model: "base",
    load: false // Defer loading
)
let whisperKit = try await WhisperKit(config)

// Load later
try await whisperKit.loadModels()
Logging
Verbosity
verbose: Bool
Enable detailed logging output.
logLevel: Logging.LogLevel (default: .info)
Maximum log level to display: .debug, .info, .error, .none.
let config = WhisperKitConfig(
    verbose: true,
    logLevel: .debug // Show all logs
)
let whisperKit = try await WhisperKit(config)

// Custom logging callback
whisperKit.loggingCallback { level, message in
    print("[\(level)] \(message)")
    // Send to analytics, file, etc.
}
See WhisperKit.loggingCallback
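If you also want these messages in the unified logging system, you can forward them from the same callback. A sketch assuming the (level, message) callback shape shown above, with a placeholder subsystem identifier:
import os
import WhisperKit

// Sketch: route WhisperKit log output to os.Logger so it appears in Console.app.
// The subsystem string is a placeholder; the callback shape follows the example above.
let logger = Logger(subsystem: "com.example.myapp", category: "WhisperKit")

whisperKit.loggingCallback { level, message in
    switch level {
    case .error:
        logger.error("\(message, privacy: .public)")
    case .debug:
        logger.debug("\(message, privacy: .public)")
    default:
        logger.info("\(message, privacy: .public)")
    }
}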
Custom Components
Replace default components with custom implementations:
Audio Processor
audioProcessor: AudioProcessing? (default: AudioProcessor())
Custom audio processing implementation.
class CustomAudioProcessor: AudioProcessing {
    // Custom implementation
}

let config = WhisperKitConfig(
    audioProcessor: CustomAudioProcessor()
)
Feature Extractor
featureExtractor: FeatureExtracting?
Custom mel spectrogram feature extraction.
Audio Encoder
audioEncoder: AudioEncoding? (default: AudioEncoder())
Custom audio encoding implementation.
Text Decoder
textDecoder: TextDecoding? (default: TextDecoder())
Custom text decoding implementation.
Logits Filters
logitsFilters: [any LogitsFiltering]? (default: nil)
Custom filters to modify logits before sampling.
class CustomLogitsFilter: LogitsFiltering {
    func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray {
        // Custom filtering logic
        return logits
    }
}

let config = WhisperKitConfig(
    logitsFilters: [CustomLogitsFilter()]
)
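As a slightly more concrete, still hypothetical example, the sketch below suppresses a fixed set of token IDs by forcing their logits to negative infinity. It assumes the logits array is laid out as [1, 1, vocabSize] so the token ID indexes the last dimension; the token IDs themselves are placeholders.
import CoreML
import WhisperKit

// Sketch: a filter that blocks specific token IDs from being sampled.
// Assumes a [1, 1, vocabSize] logits layout; verify against your setup.
class SuppressTokensFilter: LogitsFiltering {
    let suppressedTokens: [Int]

    init(suppressedTokens: [Int]) {
        self.suppressedTokens = suppressedTokens
    }

    func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray {
        for token in suppressedTokens {
            // Push the logit low enough that the token is never chosen
            logits[[0, 0, NSNumber(value: token)]] = NSNumber(value: -Float.infinity)
        }
        return logits
    }
}

let config = WhisperKitConfig(
    logitsFilters: [SuppressTokensFilter(suppressedTokens: [123, 456])] // placeholder token IDs
)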
Segment Seeker
segmentSeeker: SegmentSeeking? (default: SegmentSeeker())
Custom segment seeking logic for timestamp alignment.
Voice Activity Detector
voiceActivityDetector: VoiceActivityDetector? (default: nil)
VAD implementation for audio chunking.
let vad = EnergyVAD(energyThreshold: 0.02)

let config = WhisperKitConfig(
    voiceActivityDetector: vad
)
Audio Input Configuration
audioInputConfig: AudioInputConfig? (default: AudioInputConfig())
Configuration for audio input processing.
public struct AudioInputConfig {
    var channelMode: ChannelMode // .mono, .stereo, etc.
    // Additional audio configuration
}

let audioConfig = AudioInputConfig(
    channelMode: .mono
)

let config = WhisperKitConfig(
    audioInputConfig: audioConfig
)
See WhisperKitConfig.audioInputConfig
State Callbacks
Monitor model state changes and transcription progress:
Model State Callback
let whisperKit = try await WhisperKit()

whisperKit.modelStateCallback = { oldState, newState in
    print("Model state: \(oldState) -> \(newState)")

    switch newState {
    case .loading:
        showLoadingIndicator()
    case .loaded:
        hideLoadingIndicator()
    case .unloaded:
        clearMemory()
    default:
        break
    }
}
See WhisperKit.modelStateCallback
Transcription State Callback
whisperKit.transcriptionStateCallback = { state in
    switch state {
    case .convertingAudio:
        print("Converting audio...")
    case .transcribing:
        print("Transcribing...")
    case .finished:
        print("Done!")
    }
}
See WhisperKit.transcriptionStateCallback
Segment Discovery Callback
whisperKit.segmentDiscoveryCallback = { segments in
    for segment in segments {
        print("[\(segment.start)s - \(segment.end)s]: \(segment.text)")
        // Update UI in real-time
    }
}
See WhisperKit.segmentDiscoveryCallback
Complete Configuration Example
import WhisperKit

func setupWhisperKit() async throws -> WhisperKit {
    // Custom VAD
    let vad = EnergyVAD(
        frameLength: 0.1,
        energyThreshold: 0.02
    )

    // Compute options
    let computeOptions = ModelComputeOptions(
        melCompute: .cpuAndGPU,
        audioEncoderCompute: .cpuAndNeuralEngine,
        textDecoderCompute: .cpuAndNeuralEngine,
        prefillCompute: .cpuOnly
    )

    // Audio configuration
    let audioConfig = AudioInputConfig(
        channelMode: .mono
    )

    // Main configuration
    let config = WhisperKitConfig(
        model: "base",
        modelRepo: "argmaxinc/whisperkit-coreml",
        computeOptions: computeOptions,
        audioInputConfig: audioConfig,
        voiceActivityDetector: vad,
        verbose: true,
        logLevel: .info,
        prewarm: true,
        load: true,
        download: true,
        useBackgroundDownloadSession: false
    )

    // Initialize
    let whisperKit = try await WhisperKit(config)

    // Setup callbacks
    whisperKit.modelStateCallback = { oldState, newState in
        print("Model: \(oldState) -> \(newState)")
    }

    whisperKit.transcriptionStateCallback = { state in
        print("Transcription: \(state)")
    }

    whisperKit.segmentDiscoveryCallback = { segments in
        for segment in segments {
            print("Segment: \(segment.text)")
        }
    }

    whisperKit.loggingCallback { level, message in
        // Custom logging
        if level == .error {
            sendToErrorTracking(message)
        }
    }

    return whisperKit
}

// Usage
let whisperKit = try await setupWhisperKit()

// Transcribe
let options = DecodingOptions(
    verbose: true,
    language: "en",
    wordTimestamps: true,
    chunkingStrategy: .vad
)

let results = try await whisperKit.transcribe(
    audioPath: "audio.wav",
    decodeOptions: options
)
Best Practices
Memory Management: Use prewarm: true for large models on memory-constrained devices. Call unloadModels() when not in use (see the sketch after this list).
Compute Units: Use the Neural Engine on iOS 17+ devices. Fall back to the GPU on older devices.
Logging: Enable verbose logging during development. Disable it in production for performance.
Background Downloads: Use background download sessions for large models (medium, large) to prevent interruption.
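On iOS, one way to act on the memory-management guidance is to unload models when the system reports memory pressure and reload them lazily before the next transcription. A minimal sketch, assuming a UIKit app that retains a whisperKit instance:
import UIKit
import WhisperKit

// Sketch: release model memory on a memory warning, reload on demand.
// Assumes a UIKit app; the class and method names here are illustrative.
final class TranscriptionManager {
    let whisperKit: WhisperKit
    private var memoryWarningObserver: NSObjectProtocol?

    init(whisperKit: WhisperKit) {
        self.whisperKit = whisperKit
        memoryWarningObserver = NotificationCenter.default.addObserver(
            forName: UIApplication.didReceiveMemoryWarningNotification,
            object: nil,
            queue: .main
        ) { [weak self] _ in
            // Free the Core ML models; they can be reloaded later
            Task { await self?.whisperKit.unloadModels() }
        }
    }

    func prepareForTranscription() async throws {
        if whisperKit.modelState != .loaded {
            // Reload if a memory warning unloaded the models earlier
            try await whisperKit.loadModels()
        }
    }
}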
For Real-time Streaming
let config = WhisperKitConfig(
    model: "tiny", // Fastest model
    computeOptions: ModelComputeOptions(
        audioEncoderCompute: .cpuAndNeuralEngine,
        textDecoderCompute: .cpuAndNeuralEngine
    ),
    voiceActivityDetector: EnergyVAD(energyThreshold: 0.02),
    prewarm: false, // Skip prewarm for faster init
    load: true
)
For Maximum Accuracy
let config = WhisperKitConfig(
    model: "large-v3", // Most accurate
    computeOptions: ModelComputeOptions(
        audioEncoderCompute: .cpuAndNeuralEngine,
        textDecoderCompute: .cpuAndNeuralEngine
    ),
    prewarm: true, // Manage memory for large model
    load: true
)
For Minimal Memory
let config = WhisperKitConfig(
    model: "tiny",
    prewarm: true,
    load: false // Load on-demand
)
let whisperKit = try await WhisperKit(config)

// Load only when needed
try await whisperKit.loadModels()

// Unload after use
await whisperKit.unloadModels()
Next Steps
Transcription: Start transcribing audio
Streaming: Real-time audio transcription