Overview
CactusGraph is a low-level computation graph abstraction for building neural network operations. It supports operator fusion, mixed precision, and memory pooling.
CactusGraph Class
Constructor
Create an input node.
size_t input(
    const std::vector<size_t>& shape,
    Precision precision = Precision::INT8
);
Tensor dimensions (e.g., {128, 512})
Data type: INT8, FP16, or FP32
Node ID for connecting to other operations
Populate input node with data.
void set_input(size_t node_id, const void* data, Precision precision);
Use external memory buffer (zero-copy).
void set_external_input(size_t node_id, void* data, Precision precision);
Arithmetic Operations
Element-wise Binary
size_t add(size_t input1, size_t input2);
size_t subtract(size_t input1, size_t input2);
size_t multiply(size_t input1, size_t input2);
size_t divide(size_t input1, size_t input2);
Element-wise Scalar
size_t scalar_add(size_t input, float value);
size_t scalar_multiply(size_t input, float value);
size_t scalar_exp(size_t input);
size_t scalar_sqrt(size_t input);
size_t scalar_log(size_t input);
Linear Algebra
matmul
Matrix multiplication.
size_t matmul(
    size_t input1,
    size_t input2,
    bool pretransposed_rhs = false,
    ComputeBackend backend = ComputeBackend::CPU
);
Whether right matrix is already transposed
backend
ComputeBackend
default: ComputeBackend::CPU
CPU or NPU execution
transpose
size_t transpose(
    size_t input,
    ComputeBackend backend = ComputeBackend::CPU
);
rms_norm
Root mean square normalization.
size_t rms_norm(
    size_t input,
    size_t weight,
    float epsilon = 1e-5f
);
rope
Rotary position embedding.
size_t rope(
    size_t input,
    float theta,
    size_t position_offset = 0,
    ComputeBackend backend = ComputeBackend::CPU
);
Rotation frequency base (typically 10000.0 or 1000000.0)
Starting position for incremental decoding
attention
Scaled dot-product attention.
size_t attention(
    size_t query,
    size_t key,
    size_t value,
    float scale,
    bool is_causal = true,
    ComputeBackend backend = ComputeBackend::CPU
);
Attention scale factor (typically 1/sqrt(head_dim))
Apply causal mask for autoregressive generation
attention_int8_hybrid
Hybrid attention with INT8 cached KV.
size_t attention_int8_hybrid(
    size_t query,
    size_t key_new,
    size_t value_new,
    float scale,
    size_t position_offset,
    const int8_t* cached_keys,
    const int8_t* cached_values,
    const float* k_scales,
    const float* v_scales,
    size_t cache_len,
    size_t num_kv_heads,
    size_t head_dim,
    size_t window_size = 0
);
Activation Functions
size_t relu(size_t input);
size_t silu(size_t input);
size_t gelu(size_t input);
size_t sigmoid(size_t input);
size_t tanh(size_t input);
size_t glu(size_t input, int axis = -1);
Shape Operations
reshape
size_t reshape(size_t input, const std::vector<size_t>& new_shape);
slice
size_t slice(size_t input, int axis, size_t start, size_t length);
concat
size_t concat(size_t input1, size_t input2, int axis = 0);
Reduction Operations
size_t sum(size_t input, int axis);
size_t mean(size_t input, int axis);
size_t max(size_t input, int axis);
Weight Loading
mmap_weights
Memory-map weight file.
size_t mmap_weights(const std::string& filename);
set_grouped_scales
Attach quantization scales for grouped INT8/INT4.
void set_grouped_scales(
    size_t node_id,
    size_t group_size,
    size_t num_groups,
    void* scales_ptr
);
Execution
execute
Run computation graph.
void execute(const std::string& profile_file = "");
get_output
Retrieve node output.
void* get_output(size_t node_id);
soft_reset
Clear activations, keep weights.
hard_reset
Clear all memory.
Enums
enum class Precision {
    INT8,
    FP16,
    FP32,
    INT4
};
enum class ComputeBackend {
    CPU,
    NPU
};
Example: Matrix Multiplication
#include "cactus/graph/graph.h"

CactusGraph graph;

// Create inputs
auto a = graph.input({128, 512}, Precision::FP16);
auto b = graph.input({512, 256}, Precision::FP16);

// Matmul operation
auto c = graph.matmul(a, b, false, ComputeBackend::CPU);

// Set input data
std::vector<__fp16> a_data(128 * 512);
std::vector<__fp16> b_data(512 * 256);
graph.set_input(a, a_data.data(), Precision::FP16);
graph.set_input(b, b_data.data(), Precision::FP16);

// Execute
graph.execute();

// Get result
auto* result = static_cast<__fp16*>(graph.get_output(c));
See Also
Model API High-level model interface
Advanced Guide Graph optimization techniques