Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/cactus-compute/cactus/llms.txt

Use this file to discover all available pages before exploring further.

Overview

Voice Activity Detection (VAD) identifies speech segments in audio, filtering out silence and noise. This is useful for:
  • Preprocessing audio before transcription
  • Real-time speech detection in voice assistants
  • Audio segmentation and chunking
Cactus uses the Silero VAD model for accurate, low-latency detection.

cactus_vad

Detect speech segments in audio.
// Detect speech segments in audio and write the result as JSON.
//
// Audio comes from either audio_file_path (a WAV file) or pcm_buffer
// (raw 16-bit mono 16kHz PCM); pass NULL for whichever one is not used.
// options_json is an optional JSON object with VAD parameters.
//
// Returns the number of bytes written to response_buffer on success,
// or -1 on error.
int cactus_vad(
    cactus_model_t model,
    const char* audio_file_path,
    char* response_buffer,
    size_t buffer_size,
    const char* options_json,
    const uint8_t* pcm_buffer,
    size_t pcm_buffer_size
);
model
cactus_model_t
required
Silero VAD model handle from cactus_init
audio_file_path
string
Path to WAV file. NULL if using pcm_buffer
response_buffer
char*
required
Buffer to write JSON response
buffer_size
size_t
required
Size of response buffer
options_json
string
Optional JSON object with VAD parameters
pcm_buffer
uint8_t*
Raw PCM audio (16-bit mono 16kHz). NULL if using audio_file_path
pcm_buffer_size
size_t
Size of PCM buffer in bytes
return
int
Number of bytes written to response_buffer on success, -1 on error

Options JSON

{
  "threshold": 0.5,
  "neg_threshold": 0.35,
  "min_speech_duration_ms": 250,
  "max_speech_duration_s": 30.0,
  "min_silence_duration_ms": 100,
  "speech_pad_ms": 30,
  "window_size_samples": 512,
  "min_silence_at_max_speech": 100,
  "use_max_poss_sil_at_max_speech": false,
  "sampling_rate": 16000
}
threshold
float
default:"0.5"
Speech detection probability threshold (0.0-1.0)
neg_threshold
float
default:"0.35"
Non-speech probability threshold for ending segments
min_speech_duration_ms
int
default:"250"
Minimum duration of speech segment in milliseconds
max_speech_duration_s
float
default:"30.0"
Maximum duration of speech segment in seconds
min_silence_duration_ms
int
default:"100"
Minimum silence duration, in milliseconds, required to split segments
speech_pad_ms
int
default:"30"
Padding, in milliseconds, added to the start and end of each speech segment
window_size_samples
int
default:"512"
Analysis window size (must be 512 or 1024)
sampling_rate
int
default:"16000"
Audio sample rate in Hz

Response Format

{
  "success": true,
  "error": null,
  "segments": [
    {
      "start": 0,
      "end": 24000
    },
    {
      "start": 32000,
      "end": 56000
    }
  ],
  "total_time_ms": 12.5,
  "ram_usage_mb": 45.2
}
success
bool
Whether VAD processing succeeded
error
string | null
Error message if failed
segments
array
Array of speech segments with start/end sample indices
segments[].start
int
Start sample index (at 16kHz)
segments[].end
int
End sample index (at 16kHz)
total_time_ms
float
Processing time in milliseconds
ram_usage_mb
float
Memory usage in megabytes

Example: Detect Speech

#include "cactus_ffi.h"
#include <stdio.h>

int main() {
    // Load Silero VAD model
    cactus_model_t model = cactus_init("/path/to/silero-vad", NULL, false);
    if (!model) {
        fprintf(stderr, "Failed to load VAD model\n");
        return 1;
    }
    
    // Configure VAD
    const char* options = "{"
        "\"threshold\":0.5,"
        "\"min_speech_duration_ms\":250,"
        "\"min_silence_duration_ms\":100"
    "}";
    
    char response[4096];
    int result = cactus_vad(
        model,
        "/path/to/audio.wav",
        response,
        sizeof(response),
        options,
        NULL, 0
    );
    
    if (result > 0) {
        printf("VAD result:\n%s\n", response);
    } else {
        printf("Error: %s\n", cactus_get_last_error());
    }
    
    cactus_destroy(model);
    return 0;
}

Example: VAD + Transcription

#include "cactus_ffi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void transcribe_with_vad(const char* audio_path) {
    // Load models
    cactus_model_t vad = cactus_init("/path/to/silero-vad", NULL, false);
    cactus_model_t asr = cactus_init("/path/to/whisper", NULL, false);
    
    // Run VAD
    char vad_response[8192];
    cactus_vad(vad, audio_path, vad_response, sizeof(vad_response), NULL, NULL, 0);
    
    // Parse segments (simplified - use JSON parser in production)
    // Extract start/end sample indices from vad_response
    
    // Transcribe with VAD-enabled option
    const char* prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>";
    const char* options = "{\"use_vad\":true}";
    
    char transcript[16384];
    int result = cactus_transcribe(
        asr,
        audio_path,
        prompt,
        transcript,
        sizeof(transcript),
        options,
        NULL, NULL,
        NULL, 0
    );
    
    if (result > 0) {
        printf("Transcription:\n%s\n", transcript);
    }
    
    cactus_destroy(vad);
    cactus_destroy(asr);
}

Example: Real-time VAD

#include "cactus_ffi.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Bookkeeping carried across successive calls to process_audio_chunk.
typedef struct {
    // True while the stream is inside a speech segment; meant to be updated
    // from the parsed VAD response.
    bool in_speech;
    // Presumably the sample index where the current speech segment began;
    // this example only initializes it - confirm against your parsing code.
    size_t speech_start;
    // Running count of samples consumed so far; advanced by the chunk
    // length after every processed chunk.
    size_t current_sample;
} VADState;

// Runs VAD over one chunk of raw PCM and advances the stream position.
//
//   vad          VAD model handle
//   pcm_samples  16-bit mono 16kHz samples for this chunk
//   num_samples  number of samples in pcm_samples
//   state        stream bookkeeping; current_sample grows by num_samples
//
// Does nothing when state or pcm_samples is NULL or the chunk is empty.
void process_audio_chunk(
    cactus_model_t vad,
    const int16_t* pcm_samples,
    size_t num_samples,
    VADState* state
) {
    if (!state || !pcm_samples || num_samples == 0) {
        return;
    }

    // View the samples as bytes for the FFI call. The buffer is only read,
    // so keep it const instead of casting the qualifier away.
    const uint8_t* pcm_buffer = (const uint8_t*)pcm_samples;
    size_t buffer_size = num_samples * sizeof(*pcm_samples);

    char response[4096];
    int result = cactus_vad(
        vad,
        NULL,
        response,
        sizeof(response),
        NULL,
        pcm_buffer,
        buffer_size
    );

    if (result > 0) {
        // Parse response to check if speech detected
        // Update state->in_speech, state->speech_start

        if (state->in_speech) {
            printf("Speech detected at sample %zu\n", state->current_sample);
        }
    } else {
        // Surface the failure instead of silently dropping the chunk.
        fprintf(stderr, "VAD error: %s\n", cactus_get_last_error());
    }

    state->current_sample += num_samples;
}

// Audio source hooks used by this example. They are not documented on this
// page and are presumably supplied by the application's capture pipeline.
// NOTE(review): signatures are inferred from the call sites below - confirm.
bool has_audio_data(void);
size_t read_audio_chunk(int16_t* dst, size_t max_samples);

// Streams audio through VAD in fixed-size chunks until the source is drained.
// Exits with 1 if the VAD model cannot be loaded.
int main(void) {
    cactus_model_t vad = cactus_init("/path/to/silero-vad", NULL, false);
    if (!vad) {
        fprintf(stderr, "Failed to load VAD model\n");
        return 1;
    }

    VADState state = {.in_speech = false, .speech_start = 0, .current_sample = 0};

    // Process audio stream in chunks
    enum { CHUNK_SAMPLES = 8000 };  // 500ms at 16kHz
    while (has_audio_data()) {
        int16_t chunk[CHUNK_SAMPLES];
        size_t chunk_size = read_audio_chunk(chunk, CHUNK_SAMPLES);
        process_audio_chunk(vad, chunk, chunk_size, &state);
    }

    cactus_destroy(vad);
    return 0;
}

Converting Samples to Time

At 16kHz sample rate:
// Converts a sample index to its position in milliseconds
// (16 samples per millisecond at 16kHz).
float sample_to_ms(size_t sample_idx) {
    const float samples_per_ms = 16.0f;
    const float position = (float)sample_idx;
    return position / samples_per_ms;
}

// Converts a sample index to its position in seconds
// (16000 samples per second at 16kHz).
float sample_to_sec(size_t sample_idx) {
    const float samples_per_sec = 16000.0f;
    const float position = (float)sample_idx;
    return position / samples_per_sec;
}

// Milliseconds to sample index (16 samples per millisecond at 16kHz).
// The fractional part is truncated. Negative and NaN inputs map to 0 and
// values beyond the range of size_t saturate, because converting an
// out-of-range float to an unsigned integer is undefined behavior in C.
size_t ms_to_sample(float ms) {
    const float samples = ms * 16.0f;
    if (!(samples > 0.0f)) {  // written this way so NaN is caught as well
        return 0;
    }
    if (samples >= (float)(size_t)-1) {
        return (size_t)-1;
    }
    return (size_t)samples;
}

Audio Format Requirements

VAD requires:
  • Sample rate: 16 kHz
  • Channels: Mono (1 channel)
  • Format: 16-bit signed PCM
WAV files are automatically resampled. Raw PCM buffers must already be 16kHz mono.

Tuning Parameters

High Precision (fewer false positives)

{
  "threshold": 0.7,
  "neg_threshold": 0.5,
  "min_speech_duration_ms": 500
}

High Recall (fewer missed segments)

{
  "threshold": 0.3,
  "neg_threshold": 0.2,
  "min_speech_duration_ms": 100
}

Noisy Environment

{
  "threshold": 0.6,
  "speech_pad_ms": 50,
  "min_silence_duration_ms": 200
}

Performance

  • Latency: ~2ms per 512-sample window (32ms audio)
  • Throughput: ~500x real-time on CPU
  • Memory: ~45 MB

See Also

Transcription API

Speech-to-text with VAD

Python SDK

Python VAD API

Transcription Guide

Integrate VAD with ASR

Build docs developers (and LLMs) love