WhisperKit

Overview

The WhisperKit class is the main entry point for performing speech-to-text transcription using Apple’s Core ML framework. It manages model loading, audio processing, and provides both synchronous and asynchronous transcription methods.

Class Definition

open class WhisperKit

Initializers

init(_:)

Initializes WhisperKit with a configuration object.
public init(_ config: WhisperKitConfig = WhisperKitConfig()) async throws
config
WhisperKitConfig
Configuration object for WhisperKit initialization. See WhisperKitConfig for details.
Throws: An error if model setup or loading fails.

Convenience Initializer

Initializes WhisperKit with individual parameters.
public convenience init(
    model: String? = nil,
    downloadBase: URL? = nil,
    modelRepo: String? = nil,
    modelFolder: String? = nil,
    tokenizerFolder: URL? = nil,
    computeOptions: ModelComputeOptions? = nil,
    audioProcessor: (any AudioProcessing)? = nil,
    featureExtractor: (any FeatureExtracting)? = nil,
    audioEncoder: (any AudioEncoding)? = nil,
    textDecoder: (any TextDecoding)? = nil,
    logitsFilters: [any LogitsFiltering]? = nil,
    segmentSeeker: (any SegmentSeeking)? = nil,
    verbose: Bool = true,
    logLevel: Logging.LogLevel = .info,
    prewarm: Bool? = nil,
    load: Bool? = nil,
    download: Bool = true,
    useBackgroundDownloadSession: Bool = false
) async throws
model
String?
Name of the Whisper model variant to use (e.g., “tiny”, “base”, “small”, “medium”, “large”)
downloadBase
URL?
Base URL for downloading models
modelRepo
String?
Repository name for downloading models (default: “argmaxinc/whisperkit-coreml”)
modelFolder
String?
Local folder path containing pre-downloaded models
tokenizerFolder
URL?
Folder containing tokenizer files
computeOptions
ModelComputeOptions?
Options for ML compute units (CPU, GPU, Neural Engine)
audioProcessor
AudioProcessing?
Custom audio processor implementation
featureExtractor
FeatureExtracting?
Custom feature extractor implementation
audioEncoder
AudioEncoding?
Custom audio encoder implementation
textDecoder
TextDecoding?
Custom text decoder implementation
logitsFilters
[LogitsFiltering]?
Array of logits filters to apply during decoding
segmentSeeker
SegmentSeeking?
Custom segment seeker implementation
verbose
Bool
default:"true"
Enable verbose logging
logLevel
Logging.LogLevel
default:".info"
Maximum log level to display
prewarm
Bool?
Enable model prewarming to reduce peak memory during initialization
load
Bool?
Whether to load models immediately
download
Bool
default:"true"
Download models if not available locally
useBackgroundDownloadSession
Bool
default:"false"
Use background download session for model downloads

Properties

Model State

modelVariant
ModelVariant
Currently loaded model variant (tiny, base, small, medium, large, etc.)
modelState
ModelState
Current state of the model (unloaded, loading, loaded, prewarming, etc.)
modelCompute
ModelComputeOptions
Compute options for the loaded models
tokenizer
WhisperTokenizer?
The tokenizer used for encoding/decoding text

Processing Components

audioProcessor
AudioProcessing
Audio processor for handling audio input and preprocessing
featureExtractor
FeatureExtracting
Feature extractor for converting audio to mel spectrograms
audioEncoder
AudioEncoding
Audio encoder for encoding mel spectrograms to embeddings
textDecoder
TextDecoding
Text decoder for generating text from audio embeddings
segmentSeeker
SegmentSeeking
Segment seeker for managing audio window processing
voiceActivityDetector
VoiceActivityDetector?
Optional voice activity detector for chunking audio

Configuration

audioInputConfig
AudioInputConfig
Configuration for audio input processing
modelFolder
URL?
Path to the folder containing model files
tokenizerFolder
URL?
Path to the folder containing tokenizer files

Progress and Callbacks

currentTimings
TranscriptionTimings
Timing information for the current/last transcription
progress
Progress
Progress object for tracking transcription progress
segmentDiscoveryCallback
SegmentDiscoveryCallback?
Callback invoked when new transcription segments are discovered
modelStateCallback
ModelStateCallback?
Callback invoked when model state changes
transcriptionStateCallback
TranscriptionStateCallback?
Callback invoked when transcription state changes

Constants

sampleRate
Int
default:"16000"
Sample rate used for audio processing (16 kHz)
hopLength
Int
default:"160"
Hop length for mel spectrogram computation
secondsPerTimeToken
Float
default:"0.02"
Duration in seconds represented by each time token (20ms)

Static Methods

deviceName()

Returns the device identifier string.
public static func deviceName() -> String
return
String
Device identifier (e.g., “iPhone15,2”)

recommendedModels()

Returns recommended models for the current device.
public static func recommendedModels() -> ModelSupport
return
ModelSupport
Model support information including default and supported model variants

recommendedRemoteModels(from:downloadBase:token:remoteConfigName:endpoint:)

Fetches recommended models from a remote repository.
public static func recommendedRemoteModels(
    from repo: String = "argmaxinc/whisperkit-coreml",
    downloadBase: URL? = nil,
    token: String? = nil,
    remoteConfigName: String = Constants.defaultRemoteConfigName,
    endpoint: String = Constants.defaultRemoteEndpoint
) async -> ModelSupport
repo
String
default:"argmaxinc/whisperkit-coreml"
Repository to fetch model configuration from
downloadBase
URL?
Base URL for downloads
token
String?
Authentication token for the repository
remoteConfigName
String
Name of the remote configuration file
endpoint
String
API endpoint for the repository
return
ModelSupport
Model support information from the remote repository

fetchAvailableModels(from:matching:downloadBase:token:remoteConfigName:endpoint:)

Fetches list of available models from a remote repository.
public static func fetchAvailableModels(
    from repo: String = "argmaxinc/whisperkit-coreml",
    matching: [String] = ["*"],
    downloadBase: URL? = nil,
    token: String? = nil,
    remoteConfigName: String = Constants.defaultRemoteConfigName,
    endpoint: String = Constants.defaultRemoteEndpoint
) async throws -> [String]
repo
String
default:"argmaxinc/whisperkit-coreml"
Repository to fetch models from
matching
[String]
default:"[\"*\"]"
Glob patterns to filter model names
return
[String]
Array of available model names

download(variant:downloadBase:useBackgroundSession:from:token:endpoint:progressCallback:)

Downloads a specific model variant.
public static func download(
    variant: String,
    downloadBase: URL? = nil,
    useBackgroundSession: Bool = false,
    from repo: String = "argmaxinc/whisperkit-coreml",
    token: String? = nil,
    endpoint: String = Constants.defaultRemoteEndpoint,
    progressCallback: ((Progress) -> Void)? = nil
) async throws -> URL
variant
String
Model variant to download (e.g., “tiny”, “base”, “small”)
progressCallback
((Progress) -> Void)?
Optional callback for download progress updates
return
URL
Local URL of the downloaded model folder

Instance Methods

loadModels(prewarmMode:)

Loads the models into memory.
open func loadModels(prewarmMode: Bool = false) async throws
prewarmMode
Bool
default:"false"
If true, loads models in prewarm mode to reduce peak memory usage

prewarmModels()

Prewarms the models by loading them sequentially.
open func prewarmModels() async throws

unloadModels()

Unloads all models from memory.
open func unloadModels() async

clearState()

Clears the current transcription state.
open func clearState()

detectLanguage(audioPath:)

Detects the language of audio from a file path.
open func detectLanguage(
    audioPath: String
) async throws -> (language: String, langProbs: [String: Float])
audioPath
String
Path to the audio file
return
(language: String, langProbs: [String: Float])
Tuple containing detected language code and probability distribution over all languages

detectLangauge(audioArray:)

Detects the language of audio from sample array. (NOTE(review): “Langauge” appears to be a typo, but it may faithfully reproduce the upstream symbol name — verify against the WhisperKit API before correcting; the file-path variant above is spelled detectLanguage.)
open func detectLangauge(
    audioArray: [Float]
) async throws -> (language: String, langProbs: [String: Float])
audioArray
[Float]
Array of 16kHz audio samples
return
(language: String, langProbs: [String: Float])
Tuple containing detected language code and probability distribution

transcribe(audioPath:decodeOptions:callback:)

Transcribes audio from a file path.
open func transcribe(
    audioPath: String,
    decodeOptions: DecodingOptions? = nil,
    callback: TranscriptionCallback = nil
) async throws -> [TranscriptionResult]
audioPath
String
Path to the audio file to transcribe
decodeOptions
DecodingOptions?
Options for transcription (language, task, temperature, etc.)
callback
TranscriptionCallback
Optional callback for progress updates during transcription
return
[TranscriptionResult]
Array of transcription results. See TranscriptionResult for details.

transcribe(audioArray:decodeOptions:callback:segmentCallback:)

Transcribes audio from a sample array.
open func transcribe(
    audioArray: [Float],
    decodeOptions: DecodingOptions? = nil,
    callback: TranscriptionCallback = nil,
    segmentCallback: SegmentDiscoveryCallback? = nil
) async throws -> [TranscriptionResult]
audioArray
[Float]
Array of 16kHz mono audio samples
decodeOptions
DecodingOptions?
Options for transcription
callback
TranscriptionCallback
Optional callback for progress updates
segmentCallback
SegmentDiscoveryCallback?
Optional callback invoked when segments are discovered
return
[TranscriptionResult]
Array of transcription results

transcribe(audioPaths:decodeOptions:callback:)

Transcribes multiple audio files.
open func transcribe(
    audioPaths: [String],
    decodeOptions: DecodingOptions? = nil,
    callback: TranscriptionCallback = nil
) async -> [[TranscriptionResult]?]
audioPaths
[String]
Array of audio file paths to transcribe
return
[[TranscriptionResult]?]
Array of optional transcription result arrays (nil if transcription failed for that file)

transcribe(audioArrays:decodeOptions:callback:)

Transcribes multiple audio sample arrays.
open func transcribe(
    audioArrays: [[Float]],
    decodeOptions: DecodingOptions? = nil,
    callback: TranscriptionCallback = nil
) async -> [[TranscriptionResult]?]
audioArrays
[[Float]]
Array of audio sample arrays to transcribe
return
[[TranscriptionResult]?]
Array of optional transcription result arrays

loggingCallback(_:)

Sets a custom logging callback.
open func loggingCallback(_ callback: Logging.LoggingCallback?)
callback
Logging.LoggingCallback?
Custom logging callback function

Example Usage

Basic Transcription

import WhisperKit

// Initialize WhisperKit with default settings
let whisperKit = try await WhisperKit()

// Transcribe an audio file
let results = try await whisperKit.transcribe(audioPath: "/path/to/audio.wav")

// Access the transcribed text
for result in results {
    print(result.text)
}

Custom Configuration

// Initialize with specific model and options
let config = WhisperKitConfig(
    model: "base",
    computeOptions: ModelComputeOptions(
        audioEncoderCompute: .cpuAndGPU,
        textDecoderCompute: .cpuAndNeuralEngine
    ),
    verbose: true,
    load: true
)

let whisperKit = try await WhisperKit(config)

Language Detection

let (language, probabilities) = try await whisperKit.detectLanguage(
    audioPath: "/path/to/audio.wav"
)

print("Detected language: \(language)")
print("Confidence: \(probabilities[language] ?? 0)")

With Progress Callback

let results = try await whisperKit.transcribe(
    audioPath: "/path/to/audio.wav",
    decodeOptions: DecodingOptions(task: .transcribe, language: "en")
) { progress in
    print("Progress: \(progress.text)")
    return true // Continue transcription
}

Build docs developers (and LLMs) love