Overview
The AudioProcessor class handles audio loading, conversion, preprocessing, and live recording for WhisperKit. It provides utilities for converting audio to the required format (16kHz mono) and streaming audio from input devices.
Class Definition
open class AudioProcessor: NSObject, AudioProcessing
Initializer
The AudioProcessor uses the default initializer:
let processor = AudioProcessor()
Properties
Audio State
Current buffer of recorded audio samples (16kHz mono)
The audio engine used for live recording
audioEnergy
[(rel: Float, avg: Float, max: Float, min: Float)]
Array of energy measurements for each audio buffer
Array of relative energy values (0-1) normalized to the quietest buffer
Configuration
Number of past buffers to use for calculating relative energy baseline (~2 seconds at 100ms buffers)
Minimum buffer length in samples (default: 0.1 seconds = 1600 samples at 16kHz)
Callback invoked when a new audio buffer is available
When true, replaces input buffers with silence while maintaining timing
Static Methods
Loading Audio
loadAudio(fromPath:channelMode:startTime:endTime:maxReadFrameSize:)
Loads audio from a file path and converts it to the required format.
public static func loadAudio(
fromPath audioFilePath: String,
channelMode: ChannelMode = .sumChannels(nil),
startTime: Double? = 0,
endTime: Double? = nil,
maxReadFrameSize: AVAudioFrameCount? = nil
) throws -> AVAudioPCMBuffer
channelMode
ChannelMode
Default: .sumChannels(nil)
How to handle multi-channel audio:
.sumChannels(nil) - Mix all channels with peak normalization
.sumChannels([0, 1]) - Mix specific channels
.specificChannel(0) - Use only one channel
Start time in seconds to read from
End time in seconds to read until (nil = end of file)
Maximum frames to read at once (for memory management)
Audio buffer in 16kHz mono float32 format
loadAudioAsFloatArray(fromPath:channelMode:startTime:endTime:)
Loads audio and returns it as a float array.
public static func loadAudioAsFloatArray(
fromPath audioFilePath: String,
channelMode: ChannelMode = .sumChannels(nil),
startTime: Double? = 0,
endTime: Double? = nil
) throws -> [Float]
Array of 16kHz mono audio samples
loadAudio(at:channelMode:)
Loads multiple audio files concurrently.
public static func loadAudio(
at audioPaths: [String],
channelMode: ChannelMode = .sumChannels(nil)
) async -> [Result<[Float], Swift.Error>]
Array of audio file paths
Array of results, one per input file
Resampling
resampleAudio(fromFile:toSampleRate:channelCount:channelMode:frameCount:maxReadFrameSize:)
Resamples audio from a file.
public static func resampleAudio(
fromFile audioFile: AVAudioFile,
toSampleRate sampleRate: Double,
channelCount: AVAudioChannelCount,
channelMode: ChannelMode = .sumChannels(nil),
frameCount: AVAudioFrameCount? = nil,
maxReadFrameSize: AVAudioFrameCount = Constants.defaultAudioReadFrameSize
) -> AVAudioPCMBuffer?
Target sample rate (typically 16000)
Target channel count (typically 1 for mono)
resampleAudio(fromBuffer:toSampleRate:channelCount:)
Resamples an audio buffer.
public static func resampleAudio(
fromBuffer inputBuffer: AVAudioPCMBuffer,
toSampleRate sampleRate: Double,
channelCount: AVAudioChannelCount
) -> AVAudioPCMBuffer?
Channel Conversion
convertToMono(_:mode:)
Converts multi-channel audio to mono.
public static func convertToMono(
_ buffer: AVAudioPCMBuffer,
mode: ChannelMode
) -> AVAudioPCMBuffer?
Input audio buffer (possibly multi-channel)
Energy and Voice Activity
calculateAverageEnergy(of:)
Calculates RMS energy of an audio signal.
public static func calculateAverageEnergy(of signal: [Float]) -> Float
calculateEnergy(of:)
Calculates detailed energy metrics.
public static func calculateEnergy(
of signal: [Float]
) -> (avg: Float, max: Float, min: Float)
Returns:
(avg: Float, max: Float, min: Float)
Tuple containing average (RMS), maximum, and minimum energy values
calculateRelativeEnergy(of:relativeTo:)
Calculates energy relative to a reference baseline.
public static func calculateRelativeEnergy(
of signal: [Float],
relativeTo reference: Float?
) -> Float
Reference energy level (typically the minimum energy in recent buffers)
Normalized energy value from 0 to 1
isVoiceDetected(in:nextBufferInSeconds:silenceThreshold:)
Detects if voice is present in audio.
public static func isVoiceDetected(
in relativeEnergy: [Float],
nextBufferInSeconds: Float,
silenceThreshold: Float
) -> Bool
True if voice is detected above the threshold
calculateNonSilentChunks(in:)
Identifies non-silent segments of audio.
public static func calculateNonSilentChunks(
in signal: [Float]
) -> [(startIndex: Int, endIndex: Int)]
Returns:
[(startIndex: Int, endIndex: Int)]
Array of start/end index pairs for non-silent segments
calculateVoiceActivityInChunks(of:chunkCount:frameLengthSamples:frameOverlapSamples:energyThreshold:)
Calculates voice activity for audio chunks.
public static func calculateVoiceActivityInChunks(
of signal: [Float],
chunkCount: Int,
frameLengthSamples: Int,
frameOverlapSamples: Int = 0,
energyThreshold: Float = 0.022
) -> [Bool]
Energy threshold for detecting speech
Array indicating voice activity for each chunk
Utility Methods
padOrTrimAudio(fromArray:startAt:toLength:saveSegment:)
Pads or trims audio to a specific length.
public static func padOrTrimAudio(
fromArray audioArray: [Float],
startAt startIndex: Int = 0,
toLength frameLength: Int = 480_000,
saveSegment: Bool = false
) -> MLMultiArray?
Starting index in the array
Target length in samples (default is 30 seconds at 16kHz)
Padded/trimmed audio as MLMultiArray for Core ML
convertBufferToArray(buffer:chunkSize:)
Converts AVAudioPCMBuffer to float array.
public static func convertBufferToArray(
buffer: AVAudioPCMBuffer,
chunkSize: Int = 1024
) -> [Float]
requestRecordPermission()
Requests microphone permission from the user.
public static func requestRecordPermission() async -> Bool
True if permission granted
getAudioDevices() (macOS only)
Returns list of available audio input devices.
public static func getAudioDevices() -> [AudioDevice]
Array of available audio input devices
Instance Methods
Recording Control
Starts recording audio from an input device.
public func startRecordingLive(
inputDeviceID: DeviceID? = nil,
callback: (([Float]) -> Void)? = nil
) throws
Device ID to record from (nil = default device). Only used on macOS.
Callback invoked for each audio buffer
Starts recording and returns an async stream.
public func startStreamingRecordingLive(
inputDeviceID: DeviceID? = nil
) -> (AsyncThrowingStream<[Float], Error>, AsyncThrowingStream<[Float], Error>.Continuation)
Returns:
(stream: AsyncThrowingStream, continuation: Continuation)
Tuple containing the audio stream and its continuation for cancellation
pauseRecording()
Pauses recording (can be resumed).
public func pauseRecording()
Resumes recording after pause.
public func resumeRecordingLive(
inputDeviceID: DeviceID? = nil,
callback: (([Float]) -> Void)? = nil
) throws
stopRecording()
Stops recording and releases resources.
public func stopRecording()
Audio Buffer Management
purgeAudioSamples(keepingLast:)
Removes old audio samples, keeping only recent ones.
public func purgeAudioSamples(keepingLast keep: Int)
Number of samples to keep
Enables or disables input suppression (silent buffers).
public func setInputSuppressed(_ isSuppressed: Bool)
If true, replaces input with silence
padOrTrim(fromArray:startAt:toLength:)
Instance method for padding/trimming audio.
open func padOrTrim(
fromArray audioArray: [Float],
startAt startIndex: Int,
toLength frameLength: Int
) -> (any AudioProcessorOutputType)?
Example Usage
Load and Convert Audio
import WhisperKit
// Load audio from file
let audioBuffer = try AudioProcessor.loadAudio(
fromPath: "/path/to/audio.mp3",
channelMode: .sumChannels(nil)
)
// Convert to float array
let audioArray = AudioProcessor.convertBufferToArray(buffer: audioBuffer)
print("Loaded \(audioArray.count) samples")
Live Recording with Callback
let processor = AudioProcessor()
// Request permission first
let granted = await AudioProcessor.requestRecordPermission()
guard granted else {
print("Microphone permission denied")
return
}
// Start recording
try processor.startRecordingLive { audioBuffer in
print("Received buffer: \(audioBuffer.count) samples")
// Process audio buffer
}
// Stop when done
processor.stopRecording()
Streaming with AsyncStream
let processor = AudioProcessor()
let (stream, continuation) = processor.startStreamingRecordingLive()
Task {
do {
for try await audioBuffer in stream {
print("Stream received: \(audioBuffer.count) samples")
// Process audio
}
} catch {
print("Stream error: \(error)")
}
}
// Finish the stream when done (ends iteration normally; for cancellation, cancel the consuming Task)
continuation.finish()
Channel Selection
// Use only left channel
let leftChannel = try AudioProcessor.loadAudioAsFloatArray(
fromPath: "/path/to/stereo.wav",
channelMode: .specificChannel(0)
)
// Mix only specific channels
let mixed = try AudioProcessor.loadAudioAsFloatArray(
fromPath: "/path/to/multi.wav",
channelMode: .sumChannels([0, 2]) // Mix channels 0 and 2
)
Energy-Based Voice Detection
let processor = AudioProcessor()
try processor.startRecordingLive { buffer in
let energy = AudioProcessor.calculateAverageEnergy(of: buffer)
print("Energy: \(energy)")
// Check if voice is present
let hasVoice = AudioProcessor.isVoiceDetected(
in: processor.relativeEnergy,
nextBufferInSeconds: 0.1,
silenceThreshold: 0.3
)
if hasVoice {
print("Voice detected!")
}
}
Batch Loading
let paths = [
"/path/to/audio1.wav",
"/path/to/audio2.mp3",
"/path/to/audio3.m4a"
]
let results = await AudioProcessor.loadAudio(at: paths)
for (index, result) in results.enumerated() {
switch result {
case .success(let audioArray):
print("File \(index): \(audioArray.count) samples")
case .failure(let error):
print("File \(index) failed: \(error)")
}
}
Segment Audio by Silence
let audioArray = try AudioProcessor.loadAudioAsFloatArray(
fromPath: "/path/to/audio.wav"
)
let nonSilentChunks = AudioProcessor.calculateNonSilentChunks(in: audioArray)
for (index, chunk) in nonSilentChunks.enumerated() {
let startTime = Float(chunk.startIndex) / 16000.0
let endTime = Float(chunk.endIndex) / 16000.0
print("Segment \(index): \(startTime)s - \(endTime)s")
}
Get Available Devices (macOS)
#if os(macOS)
let devices = AudioProcessor.getAudioDevices()
for device in devices {
print("Device: \(device.name) (ID: \(device.id))")
}
// Record from specific device
if let device = devices.first {
try processor.startRecordingLive(inputDeviceID: device.id)
}
#endif