Skip to main content

Overview

Voice Activity Detection (VAD) identifies speech segments in audio, filtering out silence and noise. This is useful for:
  • Preprocessing audio before transcription
  • Real-time speech detection in voice assistants
  • Audio segmentation and chunking
Cactus uses the Silero VAD model for accurate, low-latency detection.

cactus_vad

Detect speech segments in audio.
int cactus_vad(
    cactus_model_t model,
    const char* audio_file_path,
    char* response_buffer,
    size_t buffer_size,
    const char* options_json,
    const uint8_t* pcm_buffer,
    size_t pcm_buffer_size
);
model
cactus_model_t
required
Silero VAD model handle from cactus_init
audio_file_path
string
Path to WAV file. NULL if using pcm_buffer
response_buffer
char*
required
Buffer to write JSON response
buffer_size
size_t
required
Size of response buffer
options_json
string
Optional JSON object with VAD parameters
pcm_buffer
uint8_t*
Raw PCM audio (16-bit mono 16kHz). NULL if using audio_file_path
pcm_buffer_size
size_t
Size of PCM buffer in bytes
return
int
Number of bytes written to response_buffer on success, -1 on error

Options JSON

{
  "threshold": 0.5,
  "neg_threshold": 0.35,
  "min_speech_duration_ms": 250,
  "max_speech_duration_s": 30.0,
  "min_silence_duration_ms": 100,
  "speech_pad_ms": 30,
  "window_size_samples": 512,
  "min_silence_at_max_speech": 100,
  "use_max_poss_sil_at_max_speech": false,
  "sampling_rate": 16000
}
threshold
float
default:"0.5"
Speech detection probability threshold (0.0-1.0)
neg_threshold
float
default:"0.35"
Non-speech probability threshold for ending segments
min_speech_duration_ms
int
default:"250"
Minimum duration of speech segment in milliseconds
max_speech_duration_s
float
default:"30.0"
Maximum duration of speech segment in seconds
min_silence_duration_ms
int
default:"100"
Minimum silence duration to split segments
speech_pad_ms
int
default:"30"
Padding to add at start/end of speech segments
window_size_samples
int
default:"512"
Analysis window size (must be 512 or 1024)
min_silence_at_max_speech
int
default:"100"
Minimum silence duration in milliseconds used to choose a split point when a segment reaches max_speech_duration_s
use_max_poss_sil_at_max_speech
bool
default:"false"
When splitting at max_speech_duration_s, split at the longest available silence instead of the first silence that satisfies min_silence_at_max_speech
sampling_rate
int
default:"16000"
Audio sample rate in Hz

Response Format

{
  "success": true,
  "error": null,
  "segments": [
    {
      "start": 0,
      "end": 24000
    },
    {
      "start": 32000,
      "end": 56000
    }
  ],
  "total_time_ms": 12.5,
  "ram_usage_mb": 45.2
}
success
bool
Whether VAD processing succeeded
error
string | null
Error message if failed
segments
array
Array of speech segments with start/end sample indices
segments[].start
int
Start sample index (at 16kHz)
segments[].end
int
End sample index (at 16kHz)
total_time_ms
float
Processing time in milliseconds
ram_usage_mb
float
Memory usage in megabytes

Example: Detect Speech

#include "cactus_ffi.h"
#include <stdio.h>

int main() {
    // Initialize the Silero VAD model from disk.
    cactus_model_t vad_model = cactus_init("/path/to/silero-vad", NULL, false);
    if (!vad_model) {
        fprintf(stderr, "Failed to load VAD model\n");
        return 1;
    }

    // VAD tuning parameters, encoded as a JSON object.
    const char* vad_options =
        "{"
        "\"threshold\":0.5,"
        "\"min_speech_duration_ms\":250,"
        "\"min_silence_duration_ms\":100"
        "}";

    // Run detection on a WAV file; the segments JSON lands in json_out.
    char json_out[4096];
    int bytes_written = cactus_vad(vad_model,
                                   "/path/to/audio.wav",
                                   json_out,
                                   sizeof(json_out),
                                   vad_options,
                                   NULL, 0);

    if (bytes_written > 0) {
        printf("VAD result:\n%s\n", json_out);
    } else {
        printf("Error: %s\n", cactus_get_last_error());
    }

    cactus_destroy(vad_model);
    return 0;
}

Example: VAD + Transcription

#include "cactus_ffi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Run VAD over audio_path, then transcribe it with the ASR model's
// built-in VAD gating enabled.
//
// audio_path: path to a 16 kHz mono WAV file.
//
// Fix over the original example: both cactus_init results and both FFI
// return codes are now checked, and models are released on every path.
void transcribe_with_vad(const char* audio_path) {
    // Load models
    cactus_model_t vad = cactus_init("/path/to/silero-vad", NULL, false);
    cactus_model_t asr = cactus_init("/path/to/whisper", NULL, false);
    if (!vad || !asr) {
        fprintf(stderr, "Failed to load models: %s\n", cactus_get_last_error());
        goto cleanup;
    }

    // Run VAD; on success vad_response holds the segments JSON.
    char vad_response[8192];
    if (cactus_vad(vad, audio_path, vad_response, sizeof(vad_response),
                   NULL, NULL, 0) < 0) {
        fprintf(stderr, "VAD failed: %s\n", cactus_get_last_error());
        goto cleanup;
    }

    // Parse segments (simplified - use JSON parser in production)
    // Extract start/end sample indices from vad_response

    // Transcribe with VAD-enabled option
    const char* prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>";
    const char* options = "{\"use_vad\":true}";

    char transcript[16384];
    int result = cactus_transcribe(
        asr,
        audio_path,
        prompt,
        transcript,
        sizeof(transcript),
        options,
        NULL, NULL,
        NULL, 0
    );

    if (result > 0) {
        printf("Transcription:\n%s\n", transcript);
    } else {
        fprintf(stderr, "Transcription failed: %s\n", cactus_get_last_error());
    }

cleanup:
    // Guarded: one of the handles may be NULL if init failed.
    // NOTE(review): assumes cactus_destroy is not documented NULL-safe — confirm.
    if (asr) cactus_destroy(asr);
    if (vad) cactus_destroy(vad);
}

Example: Real-time VAD

#include "cactus_ffi.h"
#include <stdbool.h>

// Streaming VAD bookkeeping carried across process_audio_chunk calls.
typedef struct {
    bool in_speech;        // true while a speech segment is currently open
    size_t speech_start;   // sample index at which the open segment began
    size_t current_sample; // running count of samples processed so far
} VADState;

// Feed one chunk of raw PCM through the VAD and update the running state.
//
// vad:         Silero VAD model handle from cactus_init.
// pcm_samples: 16-bit signed mono samples at 16 kHz.
// num_samples: number of int16_t samples in the chunk.
// state:       carries speech/position bookkeeping across calls.
void process_audio_chunk(
    cactus_model_t vad,
    const int16_t* pcm_samples,
    size_t num_samples,
    VADState* state
) {
    // cactus_vad takes the PCM as a raw byte buffer: reinterpret the sample
    // array as bytes (no copy, no format conversion). sizeof(int16_t)
    // replaces the magic "* 2", and const is preserved through the cast.
    size_t buffer_size = num_samples * sizeof(int16_t);
    const uint8_t* pcm_buffer = (const uint8_t*)pcm_samples;

    char response[4096];
    int result = cactus_vad(
        vad,
        NULL,              // no file path: PCM-buffer input mode
        response,
        sizeof(response),
        NULL,              // default VAD options
        pcm_buffer,
        buffer_size
    );

    if (result > 0) {
        // Parse response to check if speech detected
        // Update state->in_speech, state->speech_start

        if (state->in_speech) {
            printf("Speech detected at sample %zu\n", state->current_sample);
        }
    }

    state->current_sample += num_samples;
}

int main() {
    cactus_model_t vad = cactus_init("/path/to/silero-vad", NULL, false);
    // Fix over the original example: bail out if the model failed to load
    // instead of passing a NULL handle to cactus_vad.
    if (!vad) {
        fprintf(stderr, "Failed to load VAD model\n");
        return 1;
    }

    VADState state = {.in_speech = false, .speech_start = 0, .current_sample = 0};

    // Process audio stream in chunks of 8000 samples (500 ms at 16 kHz).
    while (has_audio_data()) {
        int16_t chunk[8000];
        size_t chunk_size = read_audio_chunk(chunk, 8000);
        process_audio_chunk(vad, chunk, chunk_size, &state);
    }

    cactus_destroy(vad);
    return 0;
}

Converting Samples to Time

At 16kHz sample rate:
// Convert a sample index (16 kHz audio) to a time in milliseconds.
float sample_to_ms(size_t sample_idx) {
    const float samples_per_ms = 16.0f; // 16000 Hz / 1000 ms
    return (float)sample_idx / samples_per_ms;
}

// Convert a sample index (16 kHz audio) to a time in seconds.
float sample_to_sec(size_t sample_idx) {
    const float sample_rate_hz = 16000.0f;
    return (float)sample_idx / sample_rate_hz;
}

// Convert a time in milliseconds to a sample index at 16 kHz.
// Fractional results are truncated toward zero by the cast.
size_t ms_to_sample(float ms) {
    const float samples_per_ms = 16.0f; // 16000 Hz / 1000 ms
    return (size_t)(ms * samples_per_ms);
}

Audio Format Requirements

VAD requires:
  • Sample rate: 16 kHz
  • Channels: Mono (1 channel)
  • Format: 16-bit signed PCM
WAV files are automatically resampled. Raw PCM buffers must already be 16kHz mono.

Tuning Parameters

High Precision (fewer false positives)

{
  "threshold": 0.7,
  "neg_threshold": 0.5,
  "min_speech_duration_ms": 500
}

High Recall (fewer missed segments)

{
  "threshold": 0.3,
  "neg_threshold": 0.2,
  "min_speech_duration_ms": 100
}

Noisy Environment

{
  "threshold": 0.6,
  "speech_pad_ms": 50,
  "min_silence_duration_ms": 200
}

Performance

  • Latency: ~2ms per 512-sample window (32ms audio)
  • Throughput: ~500x real-time on CPU
  • Memory: ~45 MB

See Also

Transcription API

Speech-to-text with VAD

Python SDK

Python VAD API

Transcription Guide

Integrate VAD with ASR

Build docs developers (and LLMs) love