Overview
The AudioProcessor class handles audio loading, conversion, preprocessing, and live recording for WhisperKit. It provides utilities for converting audio to the required format (16kHz mono) and streaming audio from input devices.
Class Definition
open class AudioProcessor: NSObject, AudioProcessing
Initializer
The AudioProcessor uses the default initializer:
let processor = AudioProcessor()
Properties
Audio State
Current buffer of recorded audio samples (16kHz mono)
The audio engine used for live recording
audioEnergy
[(rel: Float, avg: Float, max: Float, min: Float)]
Array of energy measurements for each audio buffer
Array of relative energy values (0-1) normalized to the quietest buffer
Configuration
Number of past buffers to use for calculating relative energy baseline (~2 seconds at 100ms buffers)
Minimum buffer length in samples (default: 0.1 seconds = 1600 samples at 16kHz)
Callback invoked when a new audio buffer is available
When true, replaces input buffers with silence while maintaining timing
Static Methods
Loading Audio
loadAudio(fromPath:channelMode:startTime:endTime:maxReadFrameSize:)
Loads audio from a file path and converts it to the required format.
public static func loadAudio(
fromPath audioFilePath: String,
channelMode: ChannelMode = .sumChannels(nil),
startTime: Double? = 0,
endTime: Double? = nil,
maxReadFrameSize: AVAudioFrameCount? = nil
) throws -> AVAudioPCMBuffer
channelMode
ChannelMode
Default: .sumChannels(nil)
How to handle multi-channel audio:
.sumChannels(nil) - Mix all channels with peak normalization
.sumChannels([0, 1]) - Mix specific channels
.specificChannel(0) - Use only one channel
Start time in seconds to read from
End time in seconds to read until (nil = end of file)
Maximum frames to read at once (for memory management)
Audio buffer in 16kHz mono float32 format
loadAudioAsFloatArray(fromPath:channelMode:startTime:endTime:)
Loads audio and returns it as a float array.
public static func loadAudioAsFloatArray(
fromPath audioFilePath: String,
channelMode: ChannelMode = .sumChannels(nil),
startTime: Double? = 0,
endTime: Double? = nil
) throws -> [Float]
Array of 16kHz mono audio samples
loadAudio(at:channelMode:)
Loads multiple audio files concurrently.
public static func loadAudio(
at audioPaths: [String],
channelMode: ChannelMode = .sumChannels(nil)
) async -> [Result<[Float], Swift.Error>]
Array of audio file paths
Array of results, one per input file
Resampling
resampleAudio(fromFile:toSampleRate:channelCount:channelMode:frameCount:maxReadFrameSize:)
Resamples audio from a file.
public static func resampleAudio(
fromFile audioFile: AVAudioFile,
toSampleRate sampleRate: Double,
channelCount: AVAudioChannelCount,
channelMode: ChannelMode = .sumChannels(nil),
frameCount: AVAudioFrameCount? = nil,
maxReadFrameSize: AVAudioFrameCount = Constants.defaultAudioReadFrameSize
) -> AVAudioPCMBuffer?
Target sample rate (typically 16000)
Target channel count (typically 1 for mono)
resampleAudio(fromBuffer:toSampleRate:channelCount:)
Resamples an audio buffer.
public static func resampleAudio(
fromBuffer inputBuffer: AVAudioPCMBuffer,
toSampleRate sampleRate: Double,
channelCount: AVAudioChannelCount
) -> AVAudioPCMBuffer?
Channel Conversion
convertToMono(_:mode:)
Converts multi-channel audio to mono.
public static func convertToMono(
_ buffer: AVAudioPCMBuffer,
mode: ChannelMode
) -> AVAudioPCMBuffer?
Input audio buffer (possibly multi-channel)
Energy and Voice Activity
calculateAverageEnergy(of:)
Calculates RMS energy of an audio signal.
public static func calculateAverageEnergy(of signal: [Float]) -> Float
calculateEnergy(of:)
Calculates detailed energy metrics.
public static func calculateEnergy(
of signal: [Float]
) -> (avg: Float, max: Float, min: Float)
Returns:
(avg: Float, max: Float, min: Float)
Tuple containing average (RMS), maximum, and minimum energy values
calculateRelativeEnergy(of:relativeTo:)
Calculates energy relative to a reference baseline.
public static func calculateRelativeEnergy(
of signal: [Float],
relativeTo reference: Float?
) -> Float
Reference energy level (typically the minimum energy in recent buffers)
Normalized energy value from 0 to 1
isVoiceDetected(in:nextBufferInSeconds:silenceThreshold:)
Detects if voice is present in audio.
public static func isVoiceDetected(
in relativeEnergy: [Float],
nextBufferInSeconds: Float,
silenceThreshold: Float
) -> Bool
True if voice is detected above the threshold
calculateNonSilentChunks(in:)
Identifies non-silent segments of audio.
public static func calculateNonSilentChunks(
in signal: [Float]
) -> [(startIndex: Int, endIndex: Int)]
Returns:
[(startIndex: Int, endIndex: Int)]
Array of start/end index pairs for non-silent segments
calculateVoiceActivityInChunks(of:chunkCount:frameLengthSamples:frameOverlapSamples:energyThreshold:)
Calculates voice activity for audio chunks.
public static func calculateVoiceActivityInChunks(
of signal: [Float],
chunkCount: Int,
frameLengthSamples: Int,
frameOverlapSamples: Int = 0,
energyThreshold: Float = 0.022
) -> [Bool]
Energy threshold for detecting speech
Array indicating voice activity for each chunk
Utility Methods
padOrTrimAudio(fromArray:startAt:toLength:saveSegment:)
Pads or trims audio to a specific length.
public static func padOrTrimAudio(
fromArray audioArray: [Float],
startAt startIndex: Int = 0,
toLength frameLength: Int = 480_000,
saveSegment: Bool = false
) -> MLMultiArray?
Starting index in the array
Target length in samples (default is 30 seconds at 16kHz)
Padded/trimmed audio as MLMultiArray for Core ML
convertBufferToArray(buffer:chunkSize:)
Converts AVAudioPCMBuffer to float array.
public static func convertBufferToArray(
buffer: AVAudioPCMBuffer,
chunkSize: Int = 1024
) -> [Float]
requestRecordPermission()
Requests microphone permission from the user.
public static func requestRecordPermission() async -> Bool
True if permission granted
getAudioDevices() (macOS only)
Returns list of available audio input devices.
public static func getAudioDevices() -> [AudioDevice]
Array of available audio input devices
Instance Methods
Recording Control
Starts recording audio from an input device.
public func startRecordingLive(
inputDeviceID: DeviceID? = nil,
callback: (([Float]) -> Void)? = nil
) throws
Device ID to record from (nil = default device). Only used on macOS.
Callback invoked for each audio buffer
Starts recording and returns an async stream.
public func startStreamingRecordingLive(
inputDeviceID: DeviceID? = nil
) -> (AsyncThrowingStream<[Float], Error>, AsyncThrowingStream<[Float], Error>.Continuation)
Returns:
(stream: AsyncThrowingStream, continuation: Continuation)
Tuple containing the audio stream and its continuation for cancellation
pauseRecording()
Pauses recording (can be resumed).
public func pauseRecording()
Resumes recording after pause.
public func resumeRecordingLive(
inputDeviceID: DeviceID? = nil,
callback: (([Float]) -> Void)? = nil
) throws
stopRecording()
Stops recording and releases resources.
public func stopRecording()
Audio Buffer Management
purgeAudioSamples(keepingLast:)
Removes old audio samples, keeping only recent ones.
public func purgeAudioSamples(keepingLast keep: Int)
Number of samples to keep
Enables or disables input suppression (silent buffers).
public func setInputSuppressed(_ isSuppressed: Bool)
If true, replaces input with silence
padOrTrim(fromArray:startAt:toLength:)
Instance method for padding/trimming audio.
open func padOrTrim(
fromArray audioArray: [Float],
startAt startIndex: Int,
toLength frameLength: Int
) -> (any AudioProcessorOutputType)?
Example Usage
Load and Convert Audio
import WhisperKit
// Load audio from file
let audioBuffer = try AudioProcessor.loadAudio(
fromPath: "/path/to/audio.mp3",
channelMode: .sumChannels(nil)
)
// Convert to float array
let audioArray = AudioProcessor.convertBufferToArray(buffer: audioBuffer)
print("Loaded \(audioArray.count) samples")
Live Recording with Callback
let processor = AudioProcessor()
// Request permission first
let granted = await AudioProcessor.requestRecordPermission()
guard granted else {
print("Microphone permission denied")
return
}
// Start recording
try processor.startRecordingLive { audioBuffer in
print("Received buffer: \(audioBuffer.count) samples")
// Process audio buffer
}
// Stop when done
processor.stopRecording()
Streaming with AsyncStream
let processor = AudioProcessor()
let (stream, continuation) = processor.startStreamingRecordingLive()
Task {
do {
for try await audioBuffer in stream {
print("Stream received: \(audioBuffer.count) samples")
// Process audio
}
} catch {
print("Stream error: \(error)")
}
}
// Finish the stream when done (ends iteration normally; for cancellation, cancel the consuming Task)
continuation.finish()
Channel Selection
// Use only left channel
let leftChannel = try AudioProcessor.loadAudioAsFloatArray(
fromPath: "/path/to/stereo.wav",
channelMode: .specificChannel(0)
)
// Mix only specific channels
let mixed = try AudioProcessor.loadAudioAsFloatArray(
fromPath: "/path/to/multi.wav",
channelMode: .sumChannels([0, 2]) // Mix channels 0 and 2
)
Energy-Based Voice Detection
let processor = AudioProcessor()
try processor.startRecordingLive { buffer in
let energy = AudioProcessor.calculateAverageEnergy(of: buffer)
print("Energy: \(energy)")
// Check if voice is present
let hasVoice = AudioProcessor.isVoiceDetected(
in: processor.relativeEnergy,
nextBufferInSeconds: 0.1,
silenceThreshold: 0.3
)
if hasVoice {
print("Voice detected!")
}
}
Batch Loading
let paths = [
"/path/to/audio1.wav",
"/path/to/audio2.mp3",
"/path/to/audio3.m4a"
]
let results = await AudioProcessor.loadAudio(at: paths)
for (index, result) in results.enumerated() {
switch result {
case .success(let audioArray):
print("File \(index): \(audioArray.count) samples")
case .failure(let error):
print("File \(index) failed: \(error)")
}
}
Segment Audio by Silence
let audioArray = try AudioProcessor.loadAudioAsFloatArray(
fromPath: "/path/to/audio.wav"
)
let nonSilentChunks = AudioProcessor.calculateNonSilentChunks(in: audioArray)
for (index, chunk) in nonSilentChunks.enumerated() {
let startTime = Float(chunk.startIndex) / 16000.0
let endTime = Float(chunk.endIndex) / 16000.0
print("Segment \(index): \(startTime)s - \(endTime)s")
}
Get Available Devices (macOS)
#if os(macOS)
let devices = AudioProcessor.getAudioDevices()
for device in devices {
print("Device: \(device.name) (ID: \(device.id))")
}
// Record from specific device
if let device = devices.first {
try processor.startRecordingLive(inputDeviceID: device.id)
}
#endif