Quickstart

Clone the repository

git clone https://github.com/ggml-org/ggml
cd ggml

Install Python dependencies (optional)

Some examples require Python tooling to download model weights. Skip this step if you only want to build the library.

python3.10 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt

Build with CMake

mkdir build && cd build
cmake ..
cmake --build . --config Release -j 8

Compiled binaries are placed in build/bin/.

Run the simple example

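The simple-ctx example performs a matrix multiplication using the CPU backend. Run it from the repository root (use cd .. first if you are still inside build/ from the previous step).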

./build/bin/simple-ctx

Expected output:

mul mat (4 x 3) (transposed result):
[ 60.00 55.00 50.00 110.00
 90.00 54.00 54.00 126.00
 42.00 29.00 28.00 64.00 ]

Working examples

The two simple examples demonstrate the two main APIs.

simple-ctx (legacy CPU API)
simple-backend (modern multi-backend API)

The simple-ctx example allocates a context that owns tensor data, builds a matrix multiplication graph, and executes it on the CPU.

simple-ctx.cpp

#include "ggml.h"
#include "ggml-cpu.h"

#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>

// State shared by the simple-ctx example: the two input tensors and the
// ggml context they are created in.
//
// All members start out null so that an instance which has not yet been
// passed to load_model() never carries indeterminate pointers.
struct simple_model {
    struct ggml_tensor * a = nullptr;    // first operand of the mul_mat
    struct ggml_tensor * b = nullptr;    // second operand of the mul_mat
    struct ggml_context * ctx = nullptr; // context the tensors are created in
};

// Creates model.ctx and copies the two input matrices into new F32 tensors.
//
//   a, b            : source data; rows_A * cols_A and rows_B * cols_B floats
//   rows_*, cols_*  : matrix dimensions; each tensor is created with
//                     ne[0] = cols and ne[1] = rows
//
// build_graph() later creates the ggml_mul_mat result in this same context,
// so the context is sized for that tensor as well (rows_A x rows_B elements)
// instead of relying on the fixed slack to absorb it.
void load_model(simple_model & model, float * a, float * b,
                int rows_A, int cols_A, int rows_B, int cols_B) {
    const size_t elem_size = ggml_type_size(GGML_TYPE_F32);

    // Do the element-count arithmetic in size_t so that large dimensions
    // cannot overflow int before being widened.
    const size_t n_a   = (size_t) rows_A * (size_t) cols_A;
    const size_t n_b   = (size_t) rows_B * (size_t) cols_B;
    const size_t n_out = (size_t) rows_A * (size_t) rows_B;

    size_t ctx_size = 0;
    ctx_size += n_a   * elem_size;          // data of tensor a
    ctx_size += n_b   * elem_size;          // data of tensor b
    ctx_size += n_out * elem_size;          // data of the mul_mat result
    ctx_size += 3 * ggml_tensor_overhead(); // metadata for a, b and the result
    ctx_size += ggml_graph_overhead();      // the compute graph itself
    ctx_size += 1024;                       // slack for small bookkeeping allocations

    struct ggml_init_params params {
        /*.mem_size   =*/ ctx_size,
        /*.mem_buffer =*/ NULL,   // let ggml allocate the buffer
        /*.no_alloc   =*/ false,  // tensor data lives inside the context
    };

    model.ctx = ggml_init(params);

    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);

    // no_alloc is false, so ->data already points at context-owned storage.
    memcpy(model.a->data, a, ggml_nbytes(model.a));
    memcpy(model.b->data, b, ggml_nbytes(model.b));
}

// Records the single operation result = a * b^T as a forward graph that is
// stored in model.ctx. Nothing is computed here.
struct ggml_cgraph * build_graph(const simple_model & model) {
    struct ggml_context * ctx = model.ctx;

    struct ggml_cgraph * graph   = ggml_new_graph(ctx);
    struct ggml_tensor * product = ggml_mul_mat(ctx, model.a, model.b);

    ggml_build_forward_expand(graph, product);

    return graph;
}

// Builds the graph for the model, runs it on the CPU with one thread and
// returns the node at index -1 of the graph (the mul_mat output here).
struct ggml_tensor * compute(const simple_model & model) {
    const int n_threads = 1;

    struct ggml_cgraph * graph = build_graph(model);
    ggml_graph_compute_with_ctx(model.ctx, graph, n_threads);

    struct ggml_tensor * output = ggml_graph_node(graph, -1);
    return output;
}

// Entry point of the simple-ctx example: multiplies two small hard-coded
// matrices and prints the result tensor row by row.
int main(void) {
    ggml_time_init();

    // A: 4 rows x 2 columns
    const int rows_A = 4;
    const int cols_A = 2;
    float matrix_A[rows_A * cols_A] = {
        2, 8,
        5, 1,
        4, 2,
        8, 6,
    };

    // B: 3 rows x 2 columns
    const int rows_B = 3;
    const int cols_B = 2;
    float matrix_B[rows_B * cols_B] = {
        10, 5,
         9, 9,
         5, 4,
    };

    simple_model model;
    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);

    struct ggml_tensor * result = compute(model);

    // Copy the result out of the context before printing it.
    std::vector<float> out_data(ggml_nelements(result));
    memcpy(out_data.data(), result->data, ggml_nbytes(result));

    const int64_t ne0 = result->ne[0]; // values per printed line
    const int64_t ne1 = result->ne[1]; // number of printed lines

    printf("mul mat (%d x %d) (transposed result):\n[",
           (int) ne0, (int) ne1);
    for (int64_t line = 0; line < ne1; ++line) {
        if (line != 0) {
            printf("\n");
        }
        const int64_t base = line * ne0;
        for (int64_t k = 0; k < ne0; ++k) {
            printf(" %.2f", out_data[base + k]);
        }
    }
    printf(" ]\n");

    ggml_free(model.ctx);
    return 0;
}

Key points:

ggml_init() creates a context that owns tensor memory (no_alloc = false).
ggml_new_tensor_2d() allocates a tensor inside the context.
ggml_mul_mat() only describes the operation by creating the result tensor, and ggml_build_forward_expand() adds it to the graph — no computation happens yet.
ggml_graph_compute_with_ctx() executes the graph on the CPU.
ggml_free() releases the entire context and all its tensors.

The simple-backend example uses the backend scheduler to automatically dispatch work to the best available device (GPU if available, otherwise CPU).

simple-backend.cpp

#include "ggml.h"
#include "ggml-backend.h"

#include <cstdio>
#include <cstring>
#include <vector>

// State shared by the simple-backend example. Every handle starts out null;
// init_model() and build_graph() fill them in.
struct simple_model {
    // input tensors of the mul_mat, created by build_graph()
    struct ggml_tensor * a = nullptr;
    struct ggml_tensor * b = nullptr;

    // backend returned by ggml_backend_init_best()
    ggml_backend_t backend = nullptr;
    // CPU backend, listed after `backend` when the scheduler is created
    ggml_backend_t cpu_backend = nullptr;
    // scheduler created over the two backends above
    ggml_backend_sched_t sched = nullptr;

    // storage handed to ggml_init() as mem_buffer by build_graph()
    std::vector<uint8_t> buf;
};

// Input A: 4 rows x 2 columns.
const int rows_A = 4;
const int cols_A = 2;
float matrix_A[rows_A * cols_A] = {
    2, 8,
    5, 1,
    4, 2,
    8, 6,
};

// Input B: 3 rows x 2 columns.
const int rows_B = 3;
const int cols_B = 2;
float matrix_B[rows_B * cols_B] = {
    10, 5,
     9, 9,
     5, 4,
};

// Loads the available backends, initializes the preferred backend plus a CPU
// backend, and creates a scheduler over both (CPU listed last).
void init_model(simple_model & model) {
    ggml_backend_load_all();

    model.backend = ggml_backend_init_best();
    model.cpu_backend =
        ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);

    ggml_backend_t backends[] = { model.backend, model.cpu_backend };
    const int n_backends = (int) (sizeof(backends) / sizeof(backends[0]));

    model.sched = ggml_backend_sched_new(backends,
                                         /*bufts      =*/ nullptr,
                                         n_backends,
                                         GGML_DEFAULT_GRAPH_SIZE,
                                         /*parallel   =*/ false,
                                         /*op_offload =*/ true);
}

// Builds the graph for result = a * b^T and stores the input tensors in
// model.a / model.b. Only metadata is created here (no_alloc = true).
struct ggml_cgraph * build_graph(simple_model & model) {
    // Room for the graph plus up to GGML_DEFAULT_GRAPH_SIZE tensor headers.
    const size_t meta_size = ggml_graph_overhead()
                           + GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead();
    model.buf.resize(meta_size);

    struct ggml_init_params params = {
        /*.mem_size   =*/ meta_size,
        /*.mem_buffer =*/ model.buf.data(),
        /*.no_alloc   =*/ true, // tensors allocated later by the scheduler
    };
    struct ggml_context * meta_ctx = ggml_init(params);

    struct ggml_cgraph * graph = ggml_new_graph(meta_ctx);

    model.a = ggml_new_tensor_2d(meta_ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(meta_ctx, GGML_TYPE_F32, cols_B, rows_B);

    struct ggml_tensor * product = ggml_mul_mat(meta_ctx, model.a, model.b);
    ggml_build_forward_expand(graph, product);

    // The context was given model.buf as its memory, so the graph and tensor
    // headers live in that vector and are still used by the caller after the
    // context handle itself is released here.
    ggml_free(meta_ctx);

    return graph;
}

// Allocates the graph through the scheduler, uploads both input matrices,
// runs the graph and returns the node at index -1 of the graph.
struct ggml_tensor * compute(simple_model & model, struct ggml_cgraph * gf) {
    ggml_backend_sched_t sched = model.sched;

    ggml_backend_sched_reset(sched);
    ggml_backend_sched_alloc_graph(sched, gf);

    // The tensors have backing storage only after the allocation above, so
    // the host data is copied in at this point.
    const size_t bytes_a = ggml_nbytes(model.a);
    ggml_backend_tensor_set(model.a, matrix_A, 0, bytes_a);

    const size_t bytes_b = ggml_nbytes(model.b);
    ggml_backend_tensor_set(model.b, matrix_B, 0, bytes_b);

    ggml_backend_sched_graph_compute(sched, gf);

    struct ggml_tensor * output = ggml_graph_node(gf, -1);
    return output;
}

// Entry point of the simple-backend example: sets up the backends, runs the
// matrix multiplication through the scheduler and prints the result.
int main(void) {
    ggml_time_init();

    simple_model model;
    init_model(model);

    struct ggml_cgraph * graph  = build_graph(model);
    struct ggml_tensor * result = compute(model, graph);

    // Read the result back through the backend API into host memory.
    std::vector<float> out_data(ggml_nelements(result));
    ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result));

    const int64_t ne0 = result->ne[0]; // values per printed line
    const int64_t ne1 = result->ne[1]; // number of printed lines

    printf("mul mat (%d x %d) (transposed result):\n[",
           (int) ne0, (int) ne1);
    for (int64_t line = 0; line < ne1; ++line) {
        if (line != 0) {
            printf("\n");
        }
        const int64_t base = line * ne0;
        for (int64_t k = 0; k < ne0; ++k) {
            printf(" %.2f", out_data[base + k]);
        }
    }
    printf(" ]\n");

    // Release the scheduler first, then the two backends.
    ggml_backend_sched_free(model.sched);
    ggml_backend_free(model.backend);
    ggml_backend_free(model.cpu_backend);
    return 0;
}

Key differences from simple-ctx:

ggml_backend_load_all() discovers all compiled backends at startup.
ggml_backend_init_best() picks the highest-priority available device.
The context is created with no_alloc = true; the scheduler allocates tensors on the appropriate device.
ggml_backend_tensor_set/get transfer data between CPU and device memory.

Get Started

Core Concepts

Backends

Training

File Formats

Examples

Working examples

Build docs developers (and LLMs) love

Get Started

Core Concepts

Backends

Training

File Formats

Examples

Documentation Index

​Working examples

Build docs developers (and LLMs) love

Working examples