DMA Transfers - PSL1GHT

Overview

SPUs cannot directly access main memory. All data transfer between main memory and SPU local store must be performed via DMA (Direct Memory Access) using the Memory Flow Controller (MFC).

The PSL1GHT spu_dma.h library provides high-level wrappers with automatic alignment checking and error detection over the raw MFC intrinsics.

DMA Fundamentals

Memory Flow Controller (MFC)

Each SPU has a dedicated MFC that handles:

DMA transfers between local store and main memory
DMA transfers between SPUs
Atomic operations
Synchronization primitives

Maximum Transfer Size

16 KB (16,384 bytes) per single DMA command

Alignment Requirement

Both source and destination must be 16-byte aligned

DMA Tags

32 tags (0-31) for tracking multiple concurrent transfers

Queue Depth

16 outstanding DMA commands per SPU

DMA Transfer Types

GET
PUT
GETB
PUTF

Transfer data from main memory to local store

mfc_get(ls_addr, ea, size, tag, tid, rid);

Transfer data from local store to main memory

mfc_put(ls_addr, ea, size, tag, tid, rid);

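GET with barrier - not executed until all previously issued commands in the same tag group have completed; commands issued after it in that tag group wait for it in turn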

mfc_getb(ls_addr, ea, size, tag, tid, rid);

PUT with fence - waits for all previous commands on same tag

mfc_putf(ls_addr, ea, size, tag, tid, rid);

Basic DMA Operations

Simple GET Transfer

#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include <dma/spu_dma.h>

#define TAG 1

int main(uint64_t ea, uint64_t outptr,
         uint64_t arg3, uint64_t arg4) {
    /* One quadword of local store, aligned as the MFC requires. */
    vec_uchar16 buffer __attribute__((aligned(16)));

    /* Bit in the tag-group mask that corresponds to TAG. */
    const unsigned int tag_bit = 1 << TAG;

    /* Queue a 16-byte GET from main memory at `ea` into `buffer`. */
    mfc_get(&buffer, ea, 16, TAG, 0, 0);

    /* Select our tag group, then block until it has completed. */
    mfc_write_tag_mask(tag_bit);
    mfc_read_tag_status_all();

    /* `buffer` now holds the fetched data; processing would go here. */

    return 0;
}

Simple PUT Transfer

SPU Side - Write Results Back

vec_uint4 result __attribute__((aligned(16)));

// Compute result
result = (vec_uint4){ 1, 2, 3, 4 };

// Send back to main memory
mfc_put(&result, outptr, 16, TAG, 0, 0);

// Wait for completion
mfc_write_tag_mask(1 << TAG);
mfc_read_tag_status_all();

High-Level DMA Wrappers

The dma/spu_dma.h header provides safe wrappers with automatic checking:

Safe DMA Functions

#include <dma/spu_dma.h>

// Normal transfers (16-byte aligned, size multiple of 16)
spu_dma_get(ls, ea, size, tag, 0, 0);
spu_dma_put(ls, ea, size, tag, 0, 0);

// Barrier/fence variants
spu_dma_getb(ls, ea, size, tag, 0, 0);  // GET with barrier
spu_dma_putf(ls, ea, size, tag, 0, 0);  // PUT with fence

// Small transfers (1, 2, 4, or 8 bytes)
spu_dma_small_get(ls, ea, 4, tag, 0, 0);
spu_dma_small_put(ls, ea, 4, tag, 0, 0);

// Large transfers (> 16 KB, automatically chunked)
spu_dma_large_get(ls, ea, 32768, tag, 0, 0);
spu_dma_large_put(ls, ea, 32768, tag, 0, 0);

The high-level wrappers include assertions that will halt the SPU if alignment or size constraints are violated. In production, define NO_SPU_DMA_ASSERT to disable checks.

Complete DMA Example

String processing with DMA transfers:

#include <stdio.h>
#include <string.h>
#include <sys/spu.h>

#include "spu_bin.h"

#define ptr2ea(x) ((u64)((void*)(x)))

static volatile u32 spu_result __attribute__((aligned(128))) = 0;
static char spu_text[] __attribute__((aligned(128))) = "abCdefGhIJklMnOP";

/*
 * PPU-side driver for the example.
 *
 * Imports the embedded SPU image, creates a thread group with one SPU
 * thread, hands that thread the addresses of spu_text (arg0) and
 * spu_result (arg1), starts the group, spins until spu_result becomes
 * non-zero, joins the group and prints spu_text again.
 *
 * NOTE(review): none of the sysSpu* return values are checked; if any
 * setup call fails, the busy-wait below may never terminate.
 */
int main(int argc, char *argv[]) {
    sysSpuImage image;
    u32 thread_id, group_id;
    // 9 / 8 match strlen("mythread") + 1 and strlen("mygroup") + 1;
    // presumably the name length including the NUL - confirm against the struct definitions
    sysSpuThreadAttribute attr = { ptr2ea("mythread"), 9, SPU_THREAD_ATTR_NONE };
    sysSpuThreadGroupAttribute grpattr = { 8, ptr2ea("mygroup"), 0, {0} };
    sysSpuThreadArgument arg;
    
    printf("Input text: %s\n", spu_text);
    
    // Initialize SPU
    // NOTE(review): meaning of the literal arguments (6, 0 / 1, 100) is not
    // visible here - confirm against the sys/spu.h prototypes
    sysSpuInitialize(6, 0);
    sysSpuImageImport(&image, spu_bin, 0);
    sysSpuThreadGroupCreate(&group_id, 1, 100, &grpattr);
    
    // Pass addresses to SPU
    // arg0 = address of the text buffer, arg1 = address of the result flag;
    // the remaining two arguments are unused and zeroed
    arg.arg0 = ptr2ea(spu_text);
    arg.arg1 = ptr2ea(&spu_result);
    arg.arg2 = 0;
    arg.arg3 = 0;
    
    sysSpuThreadInitialize(&thread_id, group_id, 0, 
                          &image, &attr, &arg);
    
    // Start execution
    sysSpuThreadGroupStart(group_id);
    
    // Wait for completion (busy wait on result flag)
    // spu_result is declared volatile, so it is re-read on every iteration;
    // presumably the SPU program writes it via DMA when done - confirm on the SPU side
    while (spu_result == 0);
    
    // Join the group (collecting its exit cause/status) and release the image
    u32 cause, status;
    sysSpuThreadGroupJoin(group_id, &cause, &status);
    sysSpuImageClose(&image);
    
    printf("Output text: %s\n", spu_text);
    
    return 0;
}

DMA Tag Management

Tags allow tracking multiple concurrent DMA transfers:

Multiple Concurrent Transfers

#define TAG_INPUT  1
#define TAG_OUTPUT 2
#define TAG_TEMP   3

// Issue multiple DMA operations
mfc_get(input_buffer, input_ea, 1024, TAG_INPUT, 0, 0);
mfc_get(temp_buffer, temp_ea, 512, TAG_TEMP, 0, 0);

// Wait for specific tag
mfc_write_tag_mask(1 << TAG_INPUT);
mfc_read_tag_status_all();  // Wait for TAG_INPUT only

// Process input_buffer while temp_buffer loads...

// Wait for all tags
mfc_write_tag_mask((1 << TAG_INPUT) | (1 << TAG_TEMP) | (1 << TAG_OUTPUT));
mfc_read_tag_status_all();

Tag Wait Modes

ALL
ANY
IMMEDIATE

Wait for all specified tags to complete

mfc_write_tag_mask(0x07);  // Tags 0, 1, 2
mfc_read_tag_status_all(); // Waits for all 3

Wait for any one of the specified tags

mfc_write_tag_mask(0x07);
uint32_t completed = mfc_read_tag_status_any();
// Returns mask of completed tags

Check status without waiting

mfc_write_tag_mask(0x01);
uint32_t status = mfc_read_tag_status_immediate();
if (status & 0x01) {
    // Tag 0 is complete
}

Double Buffering

Overlap computation with data transfer for maximum performance:

Double Buffer Pattern

#define TAG_A 1
#define TAG_B 2
#define BLOCK_SIZE 4096

uint8_t buffer_a[BLOCK_SIZE] __attribute__((aligned(128)));
uint8_t buffer_b[BLOCK_SIZE] __attribute__((aligned(128)));

// Which buffer holds the block that is ready to be processed:
// 0 = buffer_a, 1 = buffer_b
int current = 0;

// Initial load into buffer A (skipped when there is nothing to transfer)
if (num_blocks > 0) {
    mfc_get(buffer_a, input_ea, BLOCK_SIZE, TAG_A, 0, 0);
    mfc_write_tag_mask(1 << TAG_A);
    mfc_read_tag_status_all();
}

for (int i = 1; i < num_blocks; i++) {
    // Compute the offset in 64 bits so large inputs do not overflow int
    uint64_t next_ea = input_ea + (uint64_t)i * BLOCK_SIZE;

    if (current == 0) {
        // Start loading next block into B while processing A
        mfc_get(buffer_b, next_ea, BLOCK_SIZE, TAG_B, 0, 0);

        // Process buffer A
        process_data(buffer_a, BLOCK_SIZE);

        // Wait for B to finish loading
        mfc_write_tag_mask(1 << TAG_B);
        mfc_read_tag_status_all();

        current = 1;
    } else {
        // Start loading next block into A while processing B
        mfc_get(buffer_a, next_ea, BLOCK_SIZE, TAG_A, 0, 0);

        // Process buffer B
        process_data(buffer_b, BLOCK_SIZE);

        // Wait for A to finish loading
        mfc_write_tag_mask(1 << TAG_A);
        mfc_read_tag_status_all();

        current = 0;
    }
}

// Process the final block. It sits in whichever buffer was loaded last,
// which is buffer_a when num_blocks is odd (including num_blocks == 1)
// and buffer_b when it is even.
if (num_blocks > 0) {
    process_data(current == 1 ? buffer_b : buffer_a, BLOCK_SIZE);
}

List DMA

Transfer multiple non-contiguous memory regions in a single operation:

DMA List Transfer

#include <dma/spu_dma.h>

// List element structure
// One 8-byte entry per region: a 32-bit size word followed by the low
// 32 bits of the region's effective address.
// NOTE(review): the hardware list element reserves the upper bits of the
// first word (stall-and-notify flag); storing a plain size works only while
// size stays small - confirm against the MFC list element layout.
typedef struct {
    uint32_t size;     // Transfer size
    uint32_t ea_low;   // Low 32 bits of effective address
} __attribute__((aligned(8))) dma_list_element;

dma_list_element list[8] __attribute__((aligned(8)));
// 8 elements x 1024 bytes = 8192 bytes are transferred; the buffer is
// twice that size.
uint8_t buffer[16384] __attribute__((aligned(16)));

// Setup list to transfer 8 scattered regions
// Each region is 1024 bytes long and regions start 2048 bytes apart.
// Only the low 32 bits are stored per element, so all regions share the
// upper 32 bits passed to the list command below.
uint64_t base_ea = input_ea;
for (int i = 0; i < 8; i++) {
    list[i].size = 1024;
    list[i].ea_low = (base_ea + i * 2048) & 0xFFFFFFFF;
}

// Perform list DMA (transfers all 8 regions)
// sizeof(list) is the list size in bytes (8 elements x 8 bytes).
// NOTE(review): this passes the already-shifted high word (base_ea >> 32).
// If spu_dma_list_get takes a full 64-bit EA and extracts the high word
// itself (as mfc_getl does), the upper bits would be lost - confirm against
// the prototype in dma/spu_dma.h.
spu_dma_list_get(buffer, base_ea >> 32, list, 
                 sizeof(list), TAG, 0, 0);

// Block until every transfer in the TAG group has completed
mfc_write_tag_mask(1 << TAG);
mfc_read_tag_status_all();

Atomic Operations

Safe read-modify-write operations on shared memory:

Atomic Compare and Swap

uint8_t ls_buffer[128] __attribute__((aligned(128)));
uint64_t ea = shared_counter_ea;  // Must be 128-byte aligned

// Get lock line atomically
mfc_getllar(ls_buffer, ea, 0, 0);
mfc_read_atomic_status();

// Modify value
uint32_t *counter = (uint32_t*)ls_buffer;
(*counter)++;

// Conditionally put back (fails if another SPU modified it)
mfc_putllc(ls_buffer, ea, 0, 0);
if (mfc_read_atomic_status() == 0) {
    // Another SPU modified it, retry...
} else {
    // Success!
}

Small Data Transfers

For transferring individual values (1, 2, 4, or 8 bytes):

Typed DMA Transfers

#include <dma/spu_dma.h>

#define TAG 1

// Write single values
spu_dma_put_uint32(0x12345678, counter_ea, TAG, 0, 0);
spu_dma_put_uint64(0xDEADBEEFCAFEBABE, timestamp_ea, TAG, 0, 0);

// Read single values
uint32_t value32 = spu_dma_get_uint32(status_ea, TAG, 0, 0);
uint64_t value64 = spu_dma_get_uint64(result_ea, TAG, 0, 0);

Small transfers have alignment requirements based on size:

1-byte: no alignment required
2-byte: 2-byte aligned
4-byte: 4-byte aligned
8-byte: 8-byte aligned

Local store and effective addresses must have the same low 4 bits.

Large Transfers

Automatically chunk transfers larger than 16 KB:

Large Transfer Example

#include <dma/spu_dma.h>

#define TAG 1

uint8_t large_buffer[65536] __attribute__((aligned(128)));

// Transfer 64 KB (automatically split into multiple 16KB DMAs)
spu_dma_large_get(large_buffer, input_ea, 65536, TAG, 0, 0);

// Wait for all chunks to complete
mfc_write_tag_mask(1 << TAG);
mfc_read_tag_status_all();

DMA and Cache Coherency

Important: DMA transfers bypass the PPU cache. Always use aligned buffers and ensure proper memory barriers.

PPU Side - Proper Alignment

#include <malloc.h>

// Always use memalign for DMA buffers
// Prefer 128-byte alignment for best performance
uint8_t *buffer = memalign(128, data_size);

if (!buffer) {
    printf("Allocation failed\n");
    return -1;
}

// Ensure data is written before SPU accesses it
__sync_synchronize();  // Memory barrier

// Start SPU thread...

// Wait for SPU completion...

// Ensure SPU writes are visible
__sync_synchronize();

free(buffer);

Common DMA Patterns

Scatter-Gather

Process Multiple Regions

// SPU processes multiple scattered input regions
for (int i = 0; i < num_regions; i++) {
    uint64_t ea = base_ea + regions[i].offset;
    uint32_t size = regions[i].size;
    
    mfc_get(buffer, ea, size, TAG, 0, 0);
    mfc_write_tag_mask(1 << TAG);
    mfc_read_tag_status_all();
    
    process_data(buffer, size);
    
    mfc_put(buffer, output_ea + i * size, size, TAG, 0, 0);
}

Pipeline Pattern

Three-Stage Pipeline

#define TAG_LOAD   1
#define TAG_STORE  2

uint8_t buf_in[BLOCK_SIZE]  __attribute__((aligned(128)));
uint8_t buf_out[BLOCK_SIZE] __attribute__((aligned(128)));

for (int i = 0; i < num_blocks; i++) {
    // Compute the offset in 64 bits so large inputs do not overflow int
    uint64_t offset = (uint64_t)i * BLOCK_SIZE;

    // Stage 1: Load the block. This overlaps with the previous iteration's
    // store, which may still be in flight on TAG_STORE.
    mfc_get(buf_in, input_ea + offset,
            BLOCK_SIZE, TAG_LOAD, 0, 0);

    // Wait for the load, and also for the previous store: process_block
    // is about to overwrite buf_out, so the MFC must be done reading it.
    // On the first iteration TAG_STORE has nothing outstanding and the
    // wait covers the load only.
    mfc_write_tag_mask((1 << TAG_LOAD) | (1 << TAG_STORE));
    mfc_read_tag_status_all();

    // Stage 2: Process
    process_block(buf_in, buf_out, BLOCK_SIZE);

    // Stage 3: Store result (completes in the background)
    mfc_put(buf_out, output_ea + offset,
            BLOCK_SIZE, TAG_STORE, 0, 0);
}

// Wait for final store
mfc_write_tag_mask(1 << TAG_STORE);
mfc_read_tag_status_all();

Debugging DMA Issues

DMA Alignment Errors

Symptoms: SPU stops or hangs

Causes:

Local store or effective address not 16-byte aligned
Transfer size not multiple of 16 bytes

Solution:

// Enable verbose assertions during development
#define SPU_DMA_ASSERT_VERBOSE
#include <dma/spu_dma.h>

// Use aligned attribute
uint8_t buffer[1024] __attribute__((aligned(128)));

DMA Queue Overflow

Symptoms: DMA commands appear to be ignored

Cause: More than 16 outstanding DMA commands

Solution:

// Wait for some DMAs before issuing more
// The tag mask has one bit per tag group; there are 32 tags (0-31), so
// selecting all of them requires all 32 bits (0xFFFF would cover 0-15 only).
mfc_write_tag_mask(0xFFFFFFFF);  // All tags
mfc_read_tag_status_any();       // Wait for any to complete

Cache Coherency Issues

Symptoms: Stale or corrupted data

Cause: PPU cache not synchronized with DMA

Solution:

// PPU side - use memory barriers
__sync_synchronize();

// Allocate DMA buffers with memalign so they start on a 128-byte boundary.
// NOTE: memalign only controls alignment; it does not return uncached
// memory, so the memory barrier above is still required.
void *buffer = memalign(128, size);

Performance Tips

Maximize Transfer Size

Use the full 16 KB per DMA command when possible

Double Buffer

Overlap DMA transfers with computation

Minimize Waits

Issue multiple DMAs before waiting

128-byte Alignment

Use 128-byte alignment for best cache performance

mfc_get - Basic DMA get (spu_mfcio.h)
mfc_put - Basic DMA put (spu_mfcio.h)
spu_dma_get - Safe DMA get wrapper (dma/spu_dma.h:99)
spu_dma_put - Safe DMA put wrapper (dma/spu_dma.h:78)
spu_dma_large_get - Large transfer get (dma/spu_dma.h:316)

Getting Started

Core Concepts

Graphics

Input

Audio

Networking

System

System Utilities

SPU Development

Tools

Advanced Topics

Documentation Index

​Overview

​DMA Fundamentals

​Memory Flow Controller (MFC)

Maximum Transfer Size

Alignment Requirement

DMA Tags

Queue Depth

​DMA Transfer Types

​Basic DMA Operations

​Simple GET Transfer

​Simple PUT Transfer

​High-Level DMA Wrappers

​Complete DMA Example

​DMA Tag Management

​Tag Wait Modes

​Double Buffering

​List DMA

​Atomic Operations

​Small Data Transfers

​Large Transfers

​DMA and Cache Coherency

​Common DMA Patterns

​Scatter-Gather

​Pipeline Pattern

​Debugging DMA Issues

​Performance Tips

Maximize Transfer Size

Double Buffer

Minimize Waits

128-byte Alignment

​Related Functions

​Next Steps

Thread Management

SPURS Framework

Build docs developers (and LLMs) love

Overview

DMA Fundamentals

Memory Flow Controller (MFC)

DMA Transfer Types

Basic DMA Operations

Simple GET Transfer

Simple PUT Transfer

High-Level DMA Wrappers

Complete DMA Example

DMA Tag Management

Tag Wait Modes

Double Buffering

List DMA

Atomic Operations

Small Data Transfers

Large Transfers

DMA and Cache Coherency

Common DMA Patterns

Scatter-Gather

Pipeline Pattern

Debugging DMA Issues

Performance Tips

Related Functions

Next Steps