Voice Activity Detection
Voice Activity Detection (VAD) identifies segments of audio containing speech versus silence. WhisperKit includes VAD capabilities to improve transcription accuracy, reduce computation, and enable intelligent audio chunking.
VoiceActivityDetector Base Class
The VoiceActivityDetector is a base class that provides common functionality for all VAD implementations:
open class VoiceActivityDetector {
    public let sampleRate: Int
    public let frameLengthSamples: Int
    public let frameOverlapSamples: Int

    open func voiceActivity(in waveform: [Float]) -> [Bool] {
        // Override in subclass
    }
}
See VoiceActivityDetector
Properties
Audio sample rate in Hz. WhisperKit uses 16kHz by default.
Length of each analysis frame in samples.
Number of samples overlapping between consecutive frames.
EnergyVAD
WhisperKit includes EnergyVAD, a simple energy-based voice activity detector:
let vad = EnergyVAD(
    sampleRate: 16000,
    frameLength: 0.1,      // 100ms frames
    frameOverlap: 0.0,     // No overlap
    energyThreshold: 0.02  // Energy threshold
)

let audioSamples: [Float] = loadAudio()
let voiceActivity: [Bool] = vad.voiceActivity(in: audioSamples)
// voiceActivity[i] == true means frame i contains speech
See EnergyVAD
EnergyVAD Initialization
public init(
    sampleRate: Int = 16000,
    frameLength: Float = 0.1,   // Seconds
    frameOverlap: Float = 0.0,  // Seconds
    energyThreshold: Float = 0.02
)
See EnergyVAD.init
Parameters
Audio sample rate matching WhisperKit.sampleRate.
Frame length in seconds. Default 0.1 = 100ms frames.
Overlap in seconds. Helps catch speech at frame boundaries.
Minimum energy level to consider as speech. Lower values are more sensitive.
Using VAD for Audio Chunking
VAD enables intelligent audio chunking based on speech activity:
import WhisperKit

let whisperKit = try await WhisperKit()

// Create VAD instance
let vad = EnergyVAD(
    frameLength: 0.1,
    energyThreshold: 0.02
)

// Configure WhisperKit to use VAD
whisperKit.voiceActivityDetector = vad

// Transcribe with VAD-based chunking
let options = DecodingOptions(
    chunkingStrategy: .vad  // Enable VAD chunking
)

let results = try await whisperKit.transcribe(
    audioPath: "long_audio.wav",
    decodeOptions: options
)
See DecodingOptions.chunkingStrategy
ChunkingStrategy Enum
public enum ChunkingStrategy: String, Codable {
    case none  // No chunking (default)
    case vad   // VAD-based chunking
}
VAD Methods
Calculate Active Chunks
Get start/end indices of speech segments:
let vad = EnergyVAD()
let audioSamples: [Float] = loadAudio()
let activeChunks = vad.calculateActiveChunks(in: audioSamples)

for chunk in activeChunks {
    let startTime = Float(chunk.startIndex) / Float(vad.sampleRate)
    let endTime = Float(chunk.endIndex) / Float(vad.sampleRate)
    print("Speech: \(startTime)s - \(endTime)s")
}
See VoiceActivityDetector.calculateActiveChunks
Find Longest Silence
Identify the longest silent period:
let vadResult: [Bool] = vad.voiceActivity(in: audioSamples)

if let silence = vad.findLongestSilence(in: vadResult) {
    let startTime = vad.voiceActivityIndexToSeconds(silence.startIndex)
    let endTime = vad.voiceActivityIndexToSeconds(silence.endIndex)
    print("Longest silence: \(startTime)s - \(endTime)s")
}
See VoiceActivityDetector.findLongestSilence
Voice Activity Clip Timestamps
Generate clip timestamps for active segments:
let clipTimestamps = vad.voiceActivityClipTimestamps(in: audioSamples)
// Returns [start1, end1, start2, end2, ...]

let options = DecodingOptions(
    clipTimestamps: clipTimestamps
)

let results = try await whisperKit.transcribe(
    audioArray: audioSamples,
    decodeOptions: options
)
See VoiceActivityDetector.voiceActivityClipTimestamps
Index Conversion Utilities
Convert VAD Index to Audio Sample
let vadIndex = 10
let sampleIndex = vad.voiceActivityIndexToAudioSampleIndex(vadIndex)
// sampleIndex = vadIndex * frameLengthSamples
See VoiceActivityDetector.voiceActivityIndexToAudioSampleIndex
Convert VAD Index to Seconds
let vadIndex = 10
let seconds = vad.voiceActivityIndexToSeconds(vadIndex)
See VoiceActivityDetector.voiceActivityIndexToSeconds
VAD in Configuration
Configure VAD in WhisperKitConfig:
let vad = EnergyVAD(
    energyThreshold: 0.015  // More sensitive
)

let config = WhisperKitConfig(
    model: "base",
    voiceActivityDetector: vad
)

let whisperKit = try await WhisperKit(config)
See WhisperKitConfig.voiceActivityDetector
Streaming with VAD
AudioStreamTranscriber uses VAD by default:
let streamTranscriber = AudioStreamTranscriber(
    // ... other parameters
    useVAD: true,          // Enable VAD
    silenceThreshold: 0.3  // VAD threshold
)

try await streamTranscriber.startStreamTranscription()
See AudioStreamTranscriber
How Streaming VAD Works
Audio buffer accumulates samples
Relative energy is calculated for recent audio
VAD checks if energy exceeds silenceThreshold
If no voice detected, transcription is skipped
If voice detected, buffer is transcribed
let voiceDetected = AudioProcessor.isVoiceDetected(
    in: relativeEnergy,
    nextBufferInSeconds: bufferDuration,
    silenceThreshold: 0.3
)

if !voiceDetected {
    // Skip transcription for this buffer
}
Custom VAD Implementation
Implement your own VAD by subclassing VoiceActivityDetector:
class MyCustomVAD: VoiceActivityDetector {
    private let model: MLModel

    init(model: MLModel) {
        self.model = model
        super.init(
            sampleRate: 16000,
            frameLengthSamples: 1600,  // 100ms at 16kHz
            frameOverlapSamples: 0
        )
    }

    override func voiceActivity(in waveform: [Float]) -> [Bool] {
        // Custom ML-based VAD logic
        var results: [Bool] = []
        let frameCount = waveform.count / frameLengthSamples

        for i in 0..<frameCount {
            let start = i * frameLengthSamples
            let end = min(start + frameLengthSamples, waveform.count)
            let frame = Array(waveform[start..<end])

            // Run your ML model or algorithm
            let hasVoice = runVADModel(on: frame)
            results.append(hasVoice)
        }

        return results
    }

    private func runVADModel(on frame: [Float]) -> Bool {
        // Your custom VAD logic here
        return true
    }
}
Async VAD
For ML models requiring async operations:
class AsyncVAD: VoiceActivityDetector {
    override func voiceActivityAsync(in waveform: [Float]) async throws -> [Bool] {
        // Async VAD logic using ML models
        var results: [Bool] = []

        // Process frames asynchronously
        for frame in extractFrames(from: waveform) {
            let hasVoice = try await runAsyncVADModel(on: frame)
            results.append(hasVoice)
        }

        return results
    }
}
See VoiceActivityDetector.voiceActivityAsync
VAD Benefits
Reduced Computation Skip transcription of silent segments, saving CPU/GPU cycles and battery.
Better Accuracy Avoid hallucinations on background noise by only transcribing speech.
Smart Chunking Split long audio at natural silence boundaries instead of arbitrary time points.
Real-time Optimization Streaming transcription skips silent buffers for better responsiveness.
Tuning Energy Threshold
The energyThreshold parameter is critical for VAD performance:
Too Low (e.g., 0.001)
Detects very quiet speech
May trigger on background noise
More false positives
Optimal (e.g., 0.02)
Balances sensitivity and specificity
Good for typical recording conditions
Default value works for most cases
Too High (e.g., 0.1)
Only detects loud speech
May miss quiet speakers
More false negatives
Testing Different Thresholds
let audioSamples: [Float] = loadAudio()

// Annotate as [Float]: a bare literal array would infer [Double],
// which does not match EnergyVAD's Float parameter.
let thresholds: [Float] = [0.01, 0.02, 0.03, 0.05]

for threshold in thresholds {
    let vad = EnergyVAD(energyThreshold: threshold)
    let activity = vad.voiceActivity(in: audioSamples)

    let speechFrames = activity.filter { $0 }.count
    let totalFrames = activity.count
    let speechPct = Float(speechFrames) / Float(totalFrames) * 100

    print("Threshold \(threshold): \(speechPct)% speech detected")
}
Complete Example
import WhisperKit

func transcribeWithVAD() async throws {
    // Initialize WhisperKit with VAD
    let vad = EnergyVAD(
        frameLength: 0.1,
        energyThreshold: 0.02
    )

    let config = WhisperKitConfig(
        model: "base",
        voiceActivityDetector: vad,
        verbose: true
    )
    let whisperKit = try await WhisperKit(config)

    // Load audio
    let audioPath = "interview.wav"
    let audioSamples = try AudioProcessor.loadAudioAsFloatArray(
        fromPath: audioPath
    )

    // Analyze with VAD
    let activeChunks = vad.calculateActiveChunks(in: audioSamples)
    print("Found \(activeChunks.count) speech segments")

    for (i, chunk) in activeChunks.enumerated() {
        let start = Float(chunk.startIndex) / Float(vad.sampleRate)
        let end = Float(chunk.endIndex) / Float(vad.sampleRate)
        print("Segment \(i): \(start)s - \(end)s")
    }

    // Transcribe with VAD chunking
    let options = DecodingOptions(
        chunkingStrategy: .vad,
        verbose: true
    )

    let results = try await whisperKit.transcribe(
        audioArray: audioSamples,
        decodeOptions: options
    )

    // Print results
    for result in results {
        print("\nTranscription:")
        print(result.text)
        print("\nSegments:")
        for segment in result.segments {
            print("[\(segment.start)s - \(segment.end)s]: \(segment.text)")
        }
    }
}
Next Steps
Streaming Use VAD in real-time streaming transcription
Configuration Advanced configuration options