AudioProcessor

Overview

The AudioProcessor class handles audio loading, conversion, preprocessing, and live recording for WhisperKit. It provides utilities for converting audio to the required format (16kHz mono) and streaming audio from input devices.

Class Definition

open class AudioProcessor: NSObject, AudioProcessing

Initializer

The AudioProcessor uses the default initializer:
let processor = AudioProcessor()

Properties

Audio State

audioSamples
ContiguousArray<Float>
Current buffer of recorded audio samples (16kHz mono)
audioEngine
AVAudioEngine?
The audio engine used for live recording
audioEnergy
[(rel: Float, avg: Float, max: Float, min: Float)]
Array of energy measurements for each audio buffer
relativeEnergy
[Float]
Array of relative energy values (0-1) normalized to the quietest buffer

Configuration

relativeEnergyWindow
Int
Default: 20
Number of past buffers to use for calculating relative energy baseline (~2 seconds at 100ms buffers)
minBufferLength
Int
Minimum buffer length in samples (default: 0.1 seconds = 1600 samples at 16kHz)
audioBufferCallback
(([Float]) -> Void)?
Callback invoked when a new audio buffer is available
isInputSuppressed
Bool
When true, replaces input buffers with silence while maintaining timing

Static Methods

Loading Audio

loadAudio(fromPath:channelMode:startTime:endTime:maxReadFrameSize:)

Loads audio from a file path and converts it to the required format.
public static func loadAudio(
    fromPath audioFilePath: String,
    channelMode: ChannelMode = .sumChannels(nil),
    startTime: Double? = 0,
    endTime: Double? = nil,
    maxReadFrameSize: AVAudioFrameCount? = nil
) throws -> AVAudioPCMBuffer
audioFilePath
String
Path to the audio file
channelMode
ChannelMode
Default: .sumChannels(nil)
How to handle multi-channel audio:
  • .sumChannels(nil) - Mix all channels with peak normalization
  • .sumChannels([0, 1]) - Mix specific channels
  • .specificChannel(0) - Use only one channel
startTime
Double?
Default: 0
Start time in seconds to read from
endTime
Double?
End time in seconds to read until (nil = end of file)
maxReadFrameSize
AVAudioFrameCount?
Maximum frames to read at once (for memory management)
return
AVAudioPCMBuffer
Audio buffer in 16kHz mono float32 format

loadAudioAsFloatArray(fromPath:channelMode:startTime:endTime:)

Loads audio and returns it as a float array.
public static func loadAudioAsFloatArray(
    fromPath audioFilePath: String,
    channelMode: ChannelMode = .sumChannels(nil),
    startTime: Double? = 0,
    endTime: Double? = nil
) throws -> [Float]
return
[Float]
Array of 16kHz mono audio samples

loadAudio(at:channelMode:)

Loads multiple audio files concurrently.
public static func loadAudio(
    at audioPaths: [String],
    channelMode: ChannelMode = .sumChannels(nil)
) async -> [Result<[Float], Swift.Error>]
audioPaths
[String]
Array of audio file paths
return
[Result<[Float], Error>]
Array of results, one per input file

Resampling

resampleAudio(fromFile:toSampleRate:channelCount:channelMode:frameCount:maxReadFrameSize:)

Resamples audio from a file.
public static func resampleAudio(
    fromFile audioFile: AVAudioFile,
    toSampleRate sampleRate: Double,
    channelCount: AVAudioChannelCount,
    channelMode: ChannelMode = .sumChannels(nil),
    frameCount: AVAudioFrameCount? = nil,
    maxReadFrameSize: AVAudioFrameCount = Constants.defaultAudioReadFrameSize
) -> AVAudioPCMBuffer?
audioFile
AVAudioFile
Input audio file
sampleRate
Double
Target sample rate (typically 16000)
channelCount
AVAudioChannelCount
Target channel count (typically 1 for mono)
return
AVAudioPCMBuffer?
Resampled audio buffer

resampleAudio(fromBuffer:toSampleRate:channelCount:)

Resamples an audio buffer.
public static func resampleAudio(
    fromBuffer inputBuffer: AVAudioPCMBuffer,
    toSampleRate sampleRate: Double,
    channelCount: AVAudioChannelCount
) -> AVAudioPCMBuffer?

Channel Conversion

convertToMono(_:mode:)

Converts multi-channel audio to mono.
public static func convertToMono(
    _ buffer: AVAudioPCMBuffer,
    mode: ChannelMode
) -> AVAudioPCMBuffer?
buffer
AVAudioPCMBuffer
Input audio buffer (possibly multi-channel)
mode
ChannelMode
How to convert to mono
return
AVAudioPCMBuffer?
Mono audio buffer

Energy and Voice Activity

calculateAverageEnergy(of:)

Calculates RMS energy of an audio signal.
public static func calculateAverageEnergy(of signal: [Float]) -> Float
return
Float
RMS energy value

calculateEnergy(of:)

Calculates detailed energy metrics.
public static func calculateEnergy(
    of signal: [Float]
) -> (avg: Float, max: Float, min: Float)
return
(avg: Float, max: Float, min: Float)
Tuple containing average (RMS), maximum, and minimum energy values

calculateRelativeEnergy(of:relativeTo:)

Calculates energy relative to a reference baseline.
public static func calculateRelativeEnergy(
    of signal: [Float],
    relativeTo reference: Float?
) -> Float
signal
[Float]
Audio signal to analyze
reference
Float?
Reference energy level (typically the minimum energy in recent buffers)
return
Float
Normalized energy value from 0 to 1

isVoiceDetected(in:nextBufferInSeconds:silenceThreshold:)

Detects if voice is present in audio.
public static func isVoiceDetected(
    in relativeEnergy: [Float],
    nextBufferInSeconds: Float,
    silenceThreshold: Float
) -> Bool
return
Bool
True if voice is detected above the threshold

calculateNonSilentChunks(in:)

Identifies non-silent segments of audio.
public static func calculateNonSilentChunks(
    in signal: [Float]
) -> [(startIndex: Int, endIndex: Int)]
return
[(startIndex: Int, endIndex: Int)]
Array of start/end index pairs for non-silent segments

calculateVoiceActivityInChunks(of:chunkCount:frameLengthSamples:frameOverlapSamples:energyThreshold:)

Calculates voice activity for audio chunks.
public static func calculateVoiceActivityInChunks(
    of signal: [Float],
    chunkCount: Int,
    frameLengthSamples: Int,
    frameOverlapSamples: Int = 0,
    energyThreshold: Float = 0.022
) -> [Bool]
energyThreshold
Float
Default: 0.022
Energy threshold for detecting speech
return
[Bool]
Array indicating voice activity for each chunk

Utility Methods

padOrTrimAudio(fromArray:startAt:toLength:saveSegment:)

Pads or trims audio to a specific length.
public static func padOrTrimAudio(
    fromArray audioArray: [Float],
    startAt startIndex: Int = 0,
    toLength frameLength: Int = 480_000,
    saveSegment: Bool = false
) -> MLMultiArray?
audioArray
[Float]
Input audio samples
startIndex
Int
Default: 0
Starting index in the array
frameLength
Int
Default: 480_000
Target length in samples (default is 30 seconds at 16kHz)
return
MLMultiArray?
Padded/trimmed audio as MLMultiArray for Core ML

convertBufferToArray(buffer:chunkSize:)

Converts AVAudioPCMBuffer to float array.
public static func convertBufferToArray(
    buffer: AVAudioPCMBuffer,
    chunkSize: Int = 1024
) -> [Float]
return
[Float]
Array of audio samples

requestRecordPermission()

Requests microphone permission from the user.
public static func requestRecordPermission() async -> Bool
return
Bool
True if permission granted

getAudioDevices() (macOS only)

Returns list of available audio input devices.
public static func getAudioDevices() -> [AudioDevice]
return
[AudioDevice]
Array of available audio input devices

Instance Methods

Recording Control

startRecordingLive(inputDeviceID:callback:)

Starts recording audio from an input device.
public func startRecordingLive(
    inputDeviceID: DeviceID? = nil,
    callback: (([Float]) -> Void)? = nil
) throws
inputDeviceID
DeviceID?
Device ID to record from (nil = default device). Only used on macOS.
callback
(([Float]) -> Void)?
Callback invoked for each audio buffer

startStreamingRecordingLive(inputDeviceID:)

Starts recording and returns an async stream.
public func startStreamingRecordingLive(
    inputDeviceID: DeviceID? = nil
) -> (AsyncThrowingStream<[Float], Error>, AsyncThrowingStream<[Float], Error>.Continuation)
return
(AsyncThrowingStream<[Float], Error>, AsyncThrowingStream<[Float], Error>.Continuation)
Tuple containing the audio stream and its continuation; call the continuation's finish() to end the stream

pauseRecording()

Pauses recording (can be resumed).
public func pauseRecording()

resumeRecordingLive(inputDeviceID:callback:)

Resumes recording after pause.
public func resumeRecordingLive(
    inputDeviceID: DeviceID? = nil,
    callback: (([Float]) -> Void)? = nil
) throws

stopRecording()

Stops recording and releases resources.
public func stopRecording()

Audio Buffer Management

purgeAudioSamples(keepingLast:)

Removes old audio samples, keeping only recent ones.
public func purgeAudioSamples(keepingLast keep: Int)
keep
Int
Number of samples to keep

setInputSuppressed(_:)

Enables or disables input suppression (silent buffers).
public func setInputSuppressed(_ isSuppressed: Bool)
isSuppressed
Bool
If true, replaces input with silence

padOrTrim(fromArray:startAt:toLength:)

Instance method for padding/trimming audio.
open func padOrTrim(
    fromArray audioArray: [Float],
    startAt startIndex: Int,
    toLength frameLength: Int
) -> (any AudioProcessorOutputType)?

Example Usage

Load and Convert Audio

import WhisperKit

// Load audio from file
let audioBuffer = try AudioProcessor.loadAudio(
    fromPath: "/path/to/audio.mp3",
    channelMode: .sumChannels(nil)
)

// Convert to float array
let audioArray = AudioProcessor.convertBufferToArray(buffer: audioBuffer)
print("Loaded \(audioArray.count) samples")

Live Recording with Callback

let processor = AudioProcessor()

// Request permission first
let granted = await AudioProcessor.requestRecordPermission()
guard granted else {
    print("Microphone permission denied")
    return
}

// Start recording
try processor.startRecordingLive { audioBuffer in
    print("Received buffer: \(audioBuffer.count) samples")
    // Process audio buffer
}

// Stop when done
processor.stopRecording()

Streaming with AsyncStream

let processor = AudioProcessor()

let (stream, continuation) = processor.startStreamingRecordingLive()

Task {
    do {
        for try await audioBuffer in stream {
            print("Stream received: \(audioBuffer.count) samples")
            // Process audio
        }
    } catch {
        print("Stream error: \(error)")
    }
}

// Cancel when done
continuation.finish()

Channel Selection

// Use only left channel
let leftChannel = try AudioProcessor.loadAudioAsFloatArray(
    fromPath: "/path/to/stereo.wav",
    channelMode: .specificChannel(0)
)

// Mix only specific channels
let mixed = try AudioProcessor.loadAudioAsFloatArray(
    fromPath: "/path/to/multi.wav",
    channelMode: .sumChannels([0, 2])  // Mix channels 0 and 2
)

Energy-Based Voice Detection

let processor = AudioProcessor()

try processor.startRecordingLive { buffer in
    let energy = AudioProcessor.calculateAverageEnergy(of: buffer)
    print("Energy: \(energy)")
    
    // Check if voice is present
    let hasVoice = AudioProcessor.isVoiceDetected(
        in: processor.relativeEnergy,
        nextBufferInSeconds: 0.1,
        silenceThreshold: 0.3
    )
    
    if hasVoice {
        print("Voice detected!")
    }
}

Batch Loading

let paths = [
    "/path/to/audio1.wav",
    "/path/to/audio2.mp3",
    "/path/to/audio3.m4a"
]

let results = await AudioProcessor.loadAudio(at: paths)

for (index, result) in results.enumerated() {
    switch result {
    case .success(let audioArray):
        print("File \(index): \(audioArray.count) samples")
    case .failure(let error):
        print("File \(index) failed: \(error)")
    }
}

Segment Audio by Silence

let audioArray = try AudioProcessor.loadAudioAsFloatArray(
    fromPath: "/path/to/audio.wav"
)

let nonSilentChunks = AudioProcessor.calculateNonSilentChunks(in: audioArray)

for (index, chunk) in nonSilentChunks.enumerated() {
    let startTime = Float(chunk.startIndex) / 16000.0
    let endTime = Float(chunk.endIndex) / 16000.0
    print("Segment \(index): \(startTime)s - \(endTime)s")
}

Get Available Devices (macOS)

#if os(macOS)
let devices = AudioProcessor.getAudioDevices()

for device in devices {
    print("Device: \(device.name) (ID: \(device.id))")
}

// Record from specific device
if let device = devices.first {
    try processor.startRecordingLive(inputDeviceID: device.id)
}
#endif

Build docs developers (and LLMs) love