Skip to main content

Overview

Voice Activity Detection (VAD) identifies speech segments in audio, filtering out silence and noise. This is useful for:
  • Preprocessing audio before transcription
  • Real-time speech detection in voice assistants
  • Audio segmentation and chunking
Cactus uses the Silero VAD model for accurate, low-latency detection.

cactus_vad

Detect speech segments in audio.
int cactus_vad(
    cactus_model_t model,
    const char* audio_file_path,
    char* response_buffer,
    size_t buffer_size,
    const char* options_json,
    const uint8_t* pcm_buffer,
    size_t pcm_buffer_size
);
model
cactus_model_t
required
Silero VAD model handle from cactus_init
audio_file_path
string
Path to WAV file. NULL if using pcm_buffer
response_buffer
char*
required
Buffer to write JSON response
buffer_size
size_t
required
Size of response buffer
options_json
string
Optional JSON object with VAD parameters
pcm_buffer
uint8_t*
Raw PCM audio (16-bit mono 16kHz). NULL if using audio_file_path
pcm_buffer_size
size_t
Size of PCM buffer in bytes
return
int
Number of bytes written to response_buffer on success, -1 on error

Options JSON

{
  "threshold": 0.5,
  "neg_threshold": 0.35,
  "min_speech_duration_ms": 250,
  "max_speech_duration_s": 30.0,
  "min_silence_duration_ms": 100,
  "speech_pad_ms": 30,
  "window_size_samples": 512,
  "min_silence_at_max_speech": 100,
  "use_max_poss_sil_at_max_speech": false,
  "sampling_rate": 16000
}
threshold
float
default:"0.5"
Speech detection probability threshold (0.0-1.0)
neg_threshold
float
default:"0.35"
Non-speech probability threshold for ending segments
min_speech_duration_ms
int
default:"250"
Minimum duration of speech segment in milliseconds
max_speech_duration_s
float
default:"30.0"
Maximum duration of speech segment in seconds
min_silence_duration_ms
int
default:"100"
Minimum silence duration to split segments
speech_pad_ms
int
default:"30"
Padding to add at start/end of speech segments
window_size_samples
int
default:"512"
Analysis window size (must be 512 or 1024)
min_silence_at_max_speech
int
default:"100"
Minimum silence duration in milliseconds used to choose a split point when a segment reaches max_speech_duration_s
use_max_poss_sil_at_max_speech
bool
default:"false"
When splitting at max_speech_duration_s, split at the longest available silence instead of the first silence that satisfies min_silence_at_max_speech
sampling_rate
int
default:"16000"
Audio sample rate in Hz

Response Format

{
  "success": true,
  "error": null,
  "segments": [
    {
      "start": 0,
      "end": 24000
    },
    {
      "start": 32000,
      "end": 56000
    }
  ],
  "total_time_ms": 12.5,
  "ram_usage_mb": 45.2
}
success
bool
Whether VAD processing succeeded
error
string | null
Error message if failed
segments
array
Array of speech segments with start/end sample indices
segments[].start
int
Start sample index (at 16kHz)
segments[].end
int
End sample index (at 16kHz)
total_time_ms
float
Processing time in milliseconds
ram_usage_mb
float
Memory usage in megabytes

Example: Detect Speech

#include "cactus_ffi.h"
#include <stdio.h>

int main() {
    // Initialize the Silero VAD model from disk.
    cactus_model_t vad_model = cactus_init("/path/to/silero-vad", NULL, false);
    if (!vad_model) {
        fprintf(stderr, "Failed to load VAD model\n");
        return 1;
    }

    // VAD tuning parameters, encoded as a JSON object.
    const char* vad_options =
        "{"
        "\"threshold\":0.5,"
        "\"min_speech_duration_ms\":250,"
        "\"min_silence_duration_ms\":100"
        "}";

    // Run detection on a WAV file; the segments JSON lands in json_out.
    char json_out[4096];
    int bytes_written = cactus_vad(vad_model,
                                   "/path/to/audio.wav",
                                   json_out,
                                   sizeof(json_out),
                                   vad_options,
                                   NULL, 0);

    if (bytes_written > 0) {
        printf("VAD result:\n%s\n", json_out);
    } else {
        printf("Error: %s\n", cactus_get_last_error());
    }

    cactus_destroy(vad_model);
    return 0;
}

Example: VAD + Transcription

#include "cactus_ffi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Run VAD over audio_path, then transcribe it with the ASR model's
// built-in VAD gating enabled.
//
// audio_path: path to a 16 kHz mono WAV file.
//
// Fix over the original example: both cactus_init results and both FFI
// return codes are now checked, and models are released on every path.
void transcribe_with_vad(const char* audio_path) {
    // Load models
    cactus_model_t vad = cactus_init("/path/to/silero-vad", NULL, false);
    cactus_model_t asr = cactus_init("/path/to/whisper", NULL, false);
    if (!vad || !asr) {
        fprintf(stderr, "Failed to load models: %s\n", cactus_get_last_error());
        goto cleanup;
    }

    // Run VAD; on success vad_response holds the segments JSON.
    char vad_response[8192];
    if (cactus_vad(vad, audio_path, vad_response, sizeof(vad_response),
                   NULL, NULL, 0) < 0) {
        fprintf(stderr, "VAD failed: %s\n", cactus_get_last_error());
        goto cleanup;
    }

    // Parse segments (simplified - use JSON parser in production)
    // Extract start/end sample indices from vad_response

    // Transcribe with VAD-enabled option
    const char* prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>";
    const char* options = "{\"use_vad\":true}";

    char transcript[16384];
    int result = cactus_transcribe(
        asr,
        audio_path,
        prompt,
        transcript,
        sizeof(transcript),
        options,
        NULL, NULL,
        NULL, 0
    );

    if (result > 0) {
        printf("Transcription:\n%s\n", transcript);
    } else {
        fprintf(stderr, "Transcription failed: %s\n", cactus_get_last_error());
    }

cleanup:
    // Guarded: one of the handles may be NULL if init failed.
    // NOTE(review): assumes cactus_destroy is not documented NULL-safe — confirm.
    if (asr) cactus_destroy(asr);
    if (vad) cactus_destroy(vad);
}

Example: Real-time VAD

#include "cactus_ffi.h"
#include <stdbool.h>

// Streaming VAD bookkeeping carried across process_audio_chunk calls.
typedef struct {
    bool in_speech;        // true while a speech segment is currently open
    size_t speech_start;   // sample index at which the open segment began
    size_t current_sample; // running count of samples processed so far
} VADState;

// Feed one chunk of raw PCM through the VAD and update the running state.
//
// vad:         Silero VAD model handle from cactus_init.
// pcm_samples: 16-bit signed mono samples at 16 kHz.
// num_samples: number of int16_t samples in the chunk.
// state:       carries speech/position bookkeeping across calls.
void process_audio_chunk(
    cactus_model_t vad,
    const int16_t* pcm_samples,
    size_t num_samples,
    VADState* state
) {
    // cactus_vad takes the PCM as a raw byte buffer: reinterpret the sample
    // array as bytes (no copy, no format conversion). sizeof(int16_t)
    // replaces the magic "* 2", and const is preserved through the cast.
    size_t buffer_size = num_samples * sizeof(int16_t);
    const uint8_t* pcm_buffer = (const uint8_t*)pcm_samples;

    char response[4096];
    int result = cactus_vad(
        vad,
        NULL,              // no file path: PCM-buffer input mode
        response,
        sizeof(response),
        NULL,              // default VAD options
        pcm_buffer,
        buffer_size
    );

    if (result > 0) {
        // Parse response to check if speech detected
        // Update state->in_speech, state->speech_start

        if (state->in_speech) {
            printf("Speech detected at sample %zu\n", state->current_sample);
        }
    }

    state->current_sample += num_samples;
}

int main() {
    cactus_model_t vad = cactus_init("/path/to/silero-vad", NULL, false);
    // Fix over the original example: bail out if the model failed to load
    // instead of passing a NULL handle to cactus_vad.
    if (!vad) {
        fprintf(stderr, "Failed to load VAD model\n");
        return 1;
    }

    VADState state = {.in_speech = false, .speech_start = 0, .current_sample = 0};

    // Process audio stream in chunks of 8000 samples (500 ms at 16 kHz).
    while (has_audio_data()) {
        int16_t chunk[8000];
        size_t chunk_size = read_audio_chunk(chunk, 8000);
        process_audio_chunk(vad, chunk, chunk_size, &state);
    }

    cactus_destroy(vad);
    return 0;
}

Converting Samples to Time

At 16kHz sample rate:
// Convert a sample index (16 kHz audio) to a time in milliseconds.
float sample_to_ms(size_t sample_idx) {
    const float samples_per_ms = 16.0f; // 16000 Hz / 1000 ms
    return (float)sample_idx / samples_per_ms;
}

// Convert a sample index (16 kHz audio) to a time in seconds.
float sample_to_sec(size_t sample_idx) {
    const float sample_rate_hz = 16000.0f;
    return (float)sample_idx / sample_rate_hz;
}

// Convert a time in milliseconds to a sample index at 16 kHz.
// Fractional results are truncated toward zero by the cast.
size_t ms_to_sample(float ms) {
    const float samples_per_ms = 16.0f; // 16000 Hz / 1000 ms
    return (size_t)(ms * samples_per_ms);
}

Audio Format Requirements

VAD requires:
  • Sample rate: 16 kHz
  • Channels: Mono (1 channel)
  • Format: 16-bit signed PCM
WAV files are automatically resampled. Raw PCM buffers must already be 16kHz mono.

Tuning Parameters

High Precision (fewer false positives)

{
  "threshold": 0.7,
  "neg_threshold": 0.5,
  "min_speech_duration_ms": 500
}

High Recall (fewer missed segments)

{
  "threshold": 0.3,
  "neg_threshold": 0.2,
  "min_speech_duration_ms": 100
}

Noisy Environment

{
  "threshold": 0.6,
  "speech_pad_ms": 50,
  "min_silence_duration_ms": 200
}

Performance

  • Latency: ~2ms per 512-sample window (32ms audio)
  • Throughput: ~500x real-time on CPU
  • Memory: ~45 MB

See Also

Transcription API

Speech-to-text with VAD

Python SDK

Python VAD API

Transcription Guide

Integrate VAD with ASR

Build docs developers (and LLMs) love