Documentation Index Fetch the complete documentation index at: https://mintlify.com/ggml-org/ggml/llms.txt
Use this file to discover all available pages before exploring further.
Operations take a struct ggml_context * as their first argument and return a struct ggml_tensor * representing the result. (The ggml_mul_mat_set_prec and ggml_flash_attn_ext_set_prec helpers are the exception: they take an existing result tensor and return void.) Operations do not perform any computation — they only record a node in the computation graph. Computation happens only when ggml_graph_compute() or ggml_graph_compute_with_ctx() is called.
Most operations have an _inplace variant that writes results back into the first tensor operand, returning a view of it.
ggml_addstruct ggml_tensor * ggml_add (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * b );
Element-wise addition a + b. b is broadcast to the shape of a when necessary. ggml_add1struct ggml_tensor * ggml_add1 (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * b );
Adds the scalar value held in tensor b to every element of a. ggml_substruct ggml_tensor * ggml_sub (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * b );
Element-wise subtraction a - b. ggml_mulstruct ggml_tensor * ggml_mul (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * b );
Element-wise multiplication a * b (Hadamard product). b is broadcast to the shape of a. ggml_divstruct ggml_tensor * ggml_div (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * b );
Element-wise division a / b. ggml_sqrstruct ggml_tensor * ggml_sqr (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Element-wise square a². ggml_sqrtstruct ggml_tensor * ggml_sqrt (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Element-wise square root √a. ggml_absstruct ggml_tensor * ggml_abs (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Element-wise absolute value |a|. ggml_negstruct ggml_tensor * ggml_neg (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Element-wise negation -a. ggml_logstruct ggml_tensor * ggml_log (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Element-wise natural logarithm ln(a). ggml_expstruct ggml_tensor * ggml_exp (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Element-wise exponential eᵃ. ggml_sin / ggml_cosstruct ggml_tensor * ggml_sin ( struct ggml_context * ctx , struct ggml_tensor * a );
struct ggml_tensor * ggml_cos ( struct ggml_context * ctx , struct ggml_tensor * a );
Element-wise trigonometric functions. ggml_scalestruct ggml_tensor * ggml_scale (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
float s );
Multiplies every element of a by the scalar s. Equivalent to a * s. ggml_clampstruct ggml_tensor * ggml_clamp (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
float min ,
float max );
Clamps every element of a to [min, max]. Operates in-place and returns a view of a.
ggml_mul_matstruct ggml_tensor * ggml_mul_mat (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * b );
Matrix multiplication. a is the weight matrix (k columns, n rows) and b is the input (k columns, m rows — transposed internally). The result is n columns by m rows.
a: [ne03, ne02, n, k]
b: [ne03*x, ne02*y, m, k]
result: [ne03*x, ne02*y, m, n]
a may be quantized; b must be F32 or F16.ggml_mul_mat_set_precvoid ggml_mul_mat_set_prec (
struct ggml_tensor * a ,
enum ggml_prec prec );
Overrides the accumulation precision of a ggml_mul_mat result tensor. Set to GGML_PREC_F32 for higher-precision accumulation (useful for models like Phi-2). ggml_mul_mat_idstruct ggml_tensor * ggml_mul_mat_id (
struct ggml_context * ctx ,
struct ggml_tensor * as ,
struct ggml_tensor * b ,
struct ggml_tensor * ids );
Indirect matrix multiplication. Selects one of the weight matrices from as using the row indices in ids, then multiplies by b. Used in mixture-of-experts routing. ggml_out_prodstruct ggml_tensor * ggml_out_prod (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * b );
Outer product. a is [m, n], b is [p, n], result is [m, p].
ggml_relustruct ggml_tensor * ggml_relu (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Rectified linear unit: max(0, a) element-wise. ggml_leaky_relustruct ggml_tensor * ggml_leaky_relu (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
float negative_slope ,
bool inplace );
Leaky ReLU: a >= 0 ? a : negative_slope * a. ggml_gelustruct ggml_tensor * ggml_gelu (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Gaussian Error Linear Unit. Uses the standard approximation based on tanh. ggml_gelu_erfstruct ggml_tensor * ggml_gelu_erf (
struct ggml_context * ctx ,
struct ggml_tensor * a );
GELU computed using the error function (erf) when available. Some backends may fall back to the Abramowitz and Stegun approximation. ggml_gelu_quickstruct ggml_tensor * ggml_gelu_quick (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Faster GELU approximation. ggml_silustruct ggml_tensor * ggml_silu (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Sigmoid Linear Unit: a * sigmoid(a). ggml_silu_backstruct ggml_tensor * ggml_silu_back (
struct ggml_context * ctx ,
struct ggml_tensor * a , // x (forward input)
struct ggml_tensor * b ); // dy (upstream gradient)
Backward pass of SiLU. Returns dx given x and dy. ggml_sigmoidstruct ggml_tensor * ggml_sigmoid (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Logistic sigmoid: 1 / (1 + exp(-a)). ggml_tanhstruct ggml_tensor * ggml_tanh (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Hyperbolic tangent. ggml_elustruct ggml_tensor * ggml_elu (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Exponential Linear Unit: a >= 0 ? a : exp(a) - 1. ggml_hardswish / ggml_hardsigmoidstruct ggml_tensor * ggml_hardswish ( struct ggml_context * ctx , struct ggml_tensor * a );
struct ggml_tensor * ggml_hardsigmoid ( struct ggml_context * ctx , struct ggml_tensor * a );
hardswish(x) = x * relu6(x + 3) / 6
hardsigmoid(x) = relu6(x + 3) / 6
Gated linear units
ggml provides fused GLU variants that split or gate the activation in a single op:
// Single-tensor GLU (gate in second half of row)
struct ggml_tensor * ggml_reglu ( struct ggml_context * ctx , struct ggml_tensor * a );
struct ggml_tensor * ggml_geglu ( struct ggml_context * ctx , struct ggml_tensor * a );
struct ggml_tensor * ggml_swiglu ( struct ggml_context * ctx , struct ggml_tensor * a );
// Split-tensor GLU (separate tensors for input and gate)
struct ggml_tensor * ggml_reglu_split (
struct ggml_context * ctx , struct ggml_tensor * a , struct ggml_tensor * b );
struct ggml_tensor * ggml_geglu_split (
struct ggml_context * ctx , struct ggml_tensor * a , struct ggml_tensor * b );
struct ggml_tensor * ggml_swiglu_split (
struct ggml_context * ctx , struct ggml_tensor * a , struct ggml_tensor * b );
ggml_normstruct ggml_tensor * ggml_norm (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
float eps );
Layer normalization along rows. Subtracts the row mean and divides by the row standard deviation. eps is added to the variance before taking the square root for numerical stability. ggml_rms_normstruct ggml_tensor * ggml_rms_norm (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
float eps );
Root mean square normalization along rows. Divides each row by its RMS. Commonly used in LLaMA-style transformers. ggml_l2_normstruct ggml_tensor * ggml_l2_norm (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
float eps );
L2 normalization along rows. Divides each row by its L2 norm. Used in RWKV v7. ggml_group_normstruct ggml_tensor * ggml_group_norm (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
int n_groups ,
float eps );
Group normalization along ne0 * ne1 / n_groups channels. Commonly used in image models such as Stable Diffusion.
n_groups: Number of channel groups to normalize over.
eps: Small constant added to the variance for numerical stability.
ggml_flash_attn_extstruct ggml_tensor * ggml_flash_attn_ext (
struct ggml_context * ctx ,
struct ggml_tensor * q ,
struct ggml_tensor * k ,
struct ggml_tensor * v ,
struct ggml_tensor * mask ,
float scale ,
float max_bias ,
float logit_softcap );
Fused scaled-dot-product attention with optional ALiBi bias and logit soft-capping. This is the primary attention kernel used by llama.cpp and related projects. Tensor layout:
q: [n_embd_k, n_batch, n_head, ne3]
k: [n_embd_k, n_kv, n_head_kv, ne3]
v: [n_embd_v, n_kv, n_head_kv, ne3] — not pre-transposed
mask: [n_kv, n_batch, ne32, ne33] — F16 or F32, optional
result: [n_embd_v, n_head, n_batch, ne3] — permuted
scale: Attention scaling factor applied before softmax. Typically 1/sqrt(head_dim).
max_bias: Maximum ALiBi slope. Set to 0.0 to disable ALiBi bias.
logit_softcap: Soft-cap applied to logits as tanh(logit / cap) * cap. Set to 0.0 to disable.
void ggml_flash_attn_ext_set_prec (
struct ggml_tensor * a ,
enum ggml_prec prec );
Overrides the precision of the flash attention accumulation (e.g. GGML_PREC_F32). ggml_soft_max_extstruct ggml_tensor * ggml_soft_max_ext (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * mask ,
float scale ,
float max_bias );
Fused softmax with optional attention mask and ALiBi bias. Computes softmax(a * scale + mask * alibi_slope).
ggml_reshape_1d / _2d / _3d / _4dstruct ggml_tensor * ggml_reshape_1d (
struct ggml_context * ctx , struct ggml_tensor * a , int64_t ne0 );
struct ggml_tensor * ggml_reshape_2d (
struct ggml_context * ctx , struct ggml_tensor * a ,
int64_t ne0 , int64_t ne1 );
struct ggml_tensor * ggml_reshape_3d (
struct ggml_context * ctx , struct ggml_tensor * a ,
int64_t ne0 , int64_t ne1 , int64_t ne2 );
struct ggml_tensor * ggml_reshape_4d (
struct ggml_context * ctx , struct ggml_tensor * a ,
int64_t ne0 , int64_t ne1 , int64_t ne2 , int64_t ne3 );
Returns a view of a with the specified shape. Total element count must match. a must be contiguous. ggml_view_1d / _2d / _3d / _4dstruct ggml_tensor * ggml_view_1d (
struct ggml_context * ctx , struct ggml_tensor * a ,
int64_t ne0 , size_t offset );
struct ggml_tensor * ggml_view_2d (
struct ggml_context * ctx , struct ggml_tensor * a ,
int64_t ne0 , int64_t ne1 ,
size_t nb1 , // row stride in bytes
size_t offset );
struct ggml_tensor * ggml_view_3d (
struct ggml_context * ctx , struct ggml_tensor * a ,
int64_t ne0 , int64_t ne1 , int64_t ne2 ,
size_t nb1 , size_t nb2 , size_t offset );
struct ggml_tensor * ggml_view_4d (
struct ggml_context * ctx , struct ggml_tensor * a ,
int64_t ne0 , int64_t ne1 , int64_t ne2 , int64_t ne3 ,
size_t nb1 , size_t nb2 , size_t nb3 , size_t offset );
Creates a view into a starting at offset bytes. Strides can differ from a, enabling sub-matrix and strided views without copying. ggml_transposestruct ggml_tensor * ggml_transpose (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Swaps the first two dimensions of a. Equivalent to ggml_permute(ctx, a, 1, 0, 2, 3). Returns a view; no data is copied. ggml_permutestruct ggml_tensor * ggml_permute (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
int axis0 , int axis1 , int axis2 , int axis3 );
Arbitrarily reorders the four axes of a. For example, ggml_permute(ctx, a, 2, 1, 0, 3) moves dimension 2 to position 0. Returns a non-contiguous view; no data is copied. ggml_contstruct ggml_tensor * ggml_cont ( struct ggml_context * ctx , struct ggml_tensor * a );
Makes a contiguous copy of a if it is not already contiguous. Variants ggml_cont_1d through ggml_cont_4d also reshape while making contiguous.
ggml_sumstruct ggml_tensor * ggml_sum (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Reduces all elements to a scalar by summing. ggml_sum_rowsstruct ggml_tensor * ggml_sum_rows (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Sums along dimension 0 (rows). Input shape [a, b, c, d] → output shape [1, b, c, d]. ggml_meanstruct ggml_tensor * ggml_mean (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Computes the mean along rows. ggml_argmaxstruct ggml_tensor * ggml_argmax (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Returns the index of the maximum element along each row. ggml_top_kstruct ggml_tensor * ggml_top_k (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
int k );
Returns the indices of the top-k elements per row. The returned indices are in no particular order. Use ggml_argsort if you need fully sorted rows.
ggml_argsortstruct ggml_tensor * ggml_argsort (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
enum ggml_sort_order order ); // GGML_SORT_ORDER_ASC or GGML_SORT_ORDER_DESC
Returns the indices that would sort each row in the given order. ggml_cumsumstruct ggml_tensor * ggml_cumsum (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Cumulative sum along the row dimension.
ggml_conv_1dstruct ggml_tensor * ggml_conv_1d (
struct ggml_context * ctx ,
struct ggml_tensor * a , // convolution kernel
struct ggml_tensor * b , // input data
int s0 , // stride
int p0 , // padding
int d0 ); // dilation
1D convolution of data b with kernel a.
a (struct ggml_tensor *, required): Convolution kernel tensor.
b (struct ggml_tensor *, required): Input data tensor.
s0: Stride along dimension 0.
p0: Padding along dimension 0.
d0: Dilation along dimension 0.
ggml_conv_2dstruct ggml_tensor * ggml_conv_2d (
struct ggml_context * ctx ,
struct ggml_tensor * a , // convolution kernel
struct ggml_tensor * b , // input data
int s0 , // stride dimension 0
int s1 , // stride dimension 1
int p0 , // padding dimension 0
int p1 , // padding dimension 1
int d0 , // dilation dimension 0
int d1 ); // dilation dimension 1
2D convolution. Implemented via ggml_im2col + ggml_mul_mat.
Embedding and positional encoding
ggml_get_rowsstruct ggml_tensor * ggml_get_rows (
struct ggml_context * ctx ,
struct ggml_tensor * a , // data [n_embd, ne1, ne2, ne3]
struct ggml_tensor * b ); // row indices (I32) [n_rows, ne2, ne3, 1]
Gathers rows from a by the integer indices stored in b. Used for token embedding lookup. Result shape: [n_embd, n_rows, ne2, ne3]. ggml_ropestruct ggml_tensor * ggml_rope (
struct ggml_context * ctx ,
struct ggml_tensor * a , // query or key tensor
struct ggml_tensor * b , // position indices (I32), size == a->ne[2]
int n_dims , // number of dimensions to rotate
int mode ); // GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, etc.
Applies Rotary Position Embedding (RoPE) to a. b is a 1D tensor of position indices. ggml_rope_extstruct ggml_tensor * ggml_rope_ext (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * b , // position indices
struct ggml_tensor * c , // frequency factors (optional, e.g. Phi3-128k)
int n_dims ,
int mode ,
int n_ctx_orig , // original context length for YaRN scaling
float freq_base ,
float freq_scale ,
float ext_factor ,
float attn_factor ,
float beta_fast ,
float beta_slow );
Extended RoPE with support for YaRN-style context extension and custom frequency scaling. Use this instead of the deprecated ggml_rope_custom.
c: Optional per-dimension frequency scaling factors. Pass NULL to use default RoPE frequencies.
n_ctx_orig: Original training context length. Used to compute YaRN correction dimensions.
freq_base: Base frequency for the sinusoidal position encoding (e.g. 10000.0).
ext_factor: YaRN extrapolation factor. Set to 0.0 to disable YaRN.
ggml_cross_entropy_lossstruct ggml_tensor * ggml_cross_entropy_loss (
struct ggml_context * ctx ,
struct ggml_tensor * a , // logits
struct ggml_tensor * b ); // labels
Computes cross-entropy loss between logits a and ground-truth labels b. The result is a scalar tensor. Mark it with ggml_set_loss() to use it as the optimization objective.
Concatenation and repetition
ggml_concatstruct ggml_tensor * ggml_concat (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * b ,
int dim );
Concatenates a and b along dimension dim. ggml_repeatstruct ggml_tensor * ggml_repeat (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
struct ggml_tensor * b );
Repeats (tiles) a to match the shape of b. If a already has the same shape as b and is not a parameter tensor, returns a directly. ggml_repeat_4dstruct ggml_tensor * ggml_repeat_4d (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
int64_t ne0 , int64_t ne1 , int64_t ne2 , int64_t ne3 );
Repeats a to an explicit 4D target shape.
ggml_diagstruct ggml_tensor * ggml_diag (
struct ggml_context * ctx ,
struct ggml_tensor * a );
Constructs a diagonal matrix from vector a. ggml_diag_mask_infstruct ggml_tensor * ggml_diag_mask_inf (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
int n_past );
Sets elements above the diagonal to -INF. Used to implement causal attention masks.
n_past: Number of past tokens. Columns at or before n_past are not masked.
ggml_diag_mask_zerostruct ggml_tensor * ggml_diag_mask_zero (
struct ggml_context * ctx ,
struct ggml_tensor * a ,
int n_past );
Sets elements above the diagonal to 0.