Skip to main content

Overview

SPUs cannot directly access main memory. All data transfer between main memory and SPU local store must be performed via DMA (Direct Memory Access) using the Memory Flow Controller (MFC).
The PSL1GHT spu_dma.h library provides high-level wrappers with automatic alignment checking and error detection over the raw MFC intrinsics.

DMA Fundamentals

Memory Flow Controller (MFC)

Each SPU has a dedicated MFC that handles:
  • DMA transfers between local store and main memory
  • DMA transfers between SPUs
  • Atomic operations
  • Synchronization primitives

Maximum Transfer Size

16 KB (16,384 bytes) per single DMA command

Alignment Requirement

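Both source and destination must be 16-byte aligned for transfers of 16 bytes or more (transfers of 1, 2, 4, or 8 bytes use natural alignment instead; see Small Data Transfers)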

DMA Tags

32 tags (0-31) for tracking multiple concurrent transfers

Queue Depth

16 outstanding DMA commands per SPU

DMA Transfer Types

Transfer data from main memory to local store
mfc_get(ls_addr, ea, size, tag, tid, rid);

Basic DMA Operations

Simple GET Transfer

#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include <dma/spu_dma.h>

#define TAG 1

// SPU entry point: pulls a single quadword from main memory into local store.
//   ea      - effective address of the 16 bytes to fetch
//   outptr  - unused in this example
//   arg3/4  - unused in this example
// Always returns 0.
int main(uint64_t ea, uint64_t outptr,
         uint64_t arg3, uint64_t arg4) {
    // Destination in local store, aligned to 16 bytes for the DMA
    vec_uchar16 quad __attribute__((aligned(16)));
    const unsigned int done_mask = 1 << TAG;

    // Queue the GET of one quadword from main memory
    mfc_get(&quad, ea, sizeof(quad), TAG, 0, 0);

    // Block until every command in tag group TAG has finished
    mfc_write_tag_mask(done_mask);
    mfc_read_tag_status_all();

    // quad now holds the fetched data
    // Process it...

    return 0;
}

Simple PUT Transfer

SPU Side - Write Results Back
vec_uint4 result __attribute__((aligned(16)));

// Compute result
result = (vec_uint4){ 1, 2, 3, 4 };

// Send back to main memory
mfc_put(&result, outptr, 16, TAG, 0, 0);

// Wait for completion
mfc_write_tag_mask(1 << TAG);
mfc_read_tag_status_all();

High-Level DMA Wrappers

The dma/spu_dma.h header provides safe wrappers with automatic checking:
Safe DMA Functions
#include <dma/spu_dma.h>

// Normal transfers (16-byte aligned, size multiple of 16)
spu_dma_get(ls, ea, size, tag, 0, 0);
spu_dma_put(ls, ea, size, tag, 0, 0);

// Barrier/fence variants
spu_dma_getb(ls, ea, size, tag, 0, 0);  // GET with barrier
spu_dma_putf(ls, ea, size, tag, 0, 0);  // PUT with fence

// Small transfers (1, 2, 4, or 8 bytes)
spu_dma_small_get(ls, ea, 4, tag, 0, 0);
spu_dma_small_put(ls, ea, 4, tag, 0, 0);

// Large transfers (> 16 KB, automatically chunked)
spu_dma_large_get(ls, ea, 32768, tag, 0, 0);
spu_dma_large_put(ls, ea, 32768, tag, 0, 0);
The high-level wrappers include assertions that will halt the SPU if alignment or size constraints are violated. In production, define NO_SPU_DMA_ASSERT to disable checks.

Complete DMA Example

String processing with DMA transfers:
#include <stdio.h>
#include <string.h>
#include <sys/spu.h>

#include "spu_bin.h"

#define ptr2ea(x) ((u64)((void*)(x)))

static volatile u32 spu_result __attribute__((aligned(128))) = 0;
static char spu_text[] __attribute__((aligned(128))) = "abCdefGhIJklMnOP";

// PPU entry point: runs the embedded SPU program on one SPU thread, passing it
// the addresses of spu_text and spu_result, then prints the text before and
// after the SPU has run.
//
// Returns 0 on success, 1 if any SPU system call fails. Every call is checked
// because a failed thread start would otherwise leave the busy-wait below
// spinning forever.
int main(int argc, char *argv[]) {
    sysSpuImage image;
    u32 thread_id, group_id;
    u32 cause, status;
    int rc = 1;
    sysSpuThreadAttribute attr = { ptr2ea("mythread"), 9, SPU_THREAD_ATTR_NONE };
    sysSpuThreadGroupAttribute grpattr = { 8, ptr2ea("mygroup"), 0, {0} };
    sysSpuThreadArgument arg;

    (void)argc;
    (void)argv;

    printf("Input text: %s\n", spu_text);

    // Initialize SPU
    if (sysSpuInitialize(6, 0) != 0) {
        printf("sysSpuInitialize failed\n");
        return rc;
    }
    if (sysSpuImageImport(&image, spu_bin, 0) != 0) {
        printf("sysSpuImageImport failed\n");
        return rc;
    }
    if (sysSpuThreadGroupCreate(&group_id, 1, 100, &grpattr) != 0) {
        printf("sysSpuThreadGroupCreate failed\n");
        goto close_image;
    }

    // Pass addresses to SPU
    arg.arg0 = ptr2ea(spu_text);
    arg.arg1 = ptr2ea(&spu_result);
    arg.arg2 = 0;
    arg.arg3 = 0;

    if (sysSpuThreadInitialize(&thread_id, group_id, 0,
                               &image, &attr, &arg) != 0) {
        printf("sysSpuThreadInitialize failed\n");
        goto destroy_group;
    }

    // Start execution
    if (sysSpuThreadGroupStart(group_id) != 0) {
        printf("sysSpuThreadGroupStart failed\n");
        goto destroy_group;
    }

    // Wait for completion (busy wait on result flag).
    // spu_result is volatile, so it is re-read on every iteration.
    while (spu_result == 0) {
    }

    if (sysSpuThreadGroupJoin(group_id, &cause, &status) != 0) {
        printf("sysSpuThreadGroupJoin failed\n");
        goto destroy_group;
    }

    printf("Output text: %s\n", spu_text);
    rc = 0;

    // Release resources in reverse order of acquisition
destroy_group:
    sysSpuThreadGroupDestroy(group_id);
close_image:
    sysSpuImageClose(&image);

    return rc;
}

DMA Tag Management

Tags allow tracking multiple concurrent DMA transfers:
Multiple Concurrent Transfers
// One tag per logical stream so each can be waited on independently
#define TAG_INPUT  1
#define TAG_OUTPUT 2
#define TAG_TEMP   3

// Issue multiple DMA operations
// Both GETs are queued back-to-back on different tags before any wait
mfc_get(input_buffer, input_ea, 1024, TAG_INPUT, 0, 0);
mfc_get(temp_buffer, temp_ea, 512, TAG_TEMP, 0, 0);

// Wait for specific tag
// The mask selects which tag groups the following status read waits on
mfc_write_tag_mask(1 << TAG_INPUT);
mfc_read_tag_status_all();  // Wait for TAG_INPUT only

// Process input_buffer while temp_buffer loads...

// Wait for all tags
// NOTE(review): no command is issued on TAG_OUTPUT in this snippet; its bit is
// presumably here for a PUT issued during the elided processing step - confirm
mfc_write_tag_mask((1 << TAG_INPUT) | (1 << TAG_TEMP) | (1 << TAG_OUTPUT));
mfc_read_tag_status_all();

Tag Wait Modes

Wait for all specified tags to complete
mfc_write_tag_mask(0x07);  // Tags 0, 1, 2
mfc_read_tag_status_all(); // Waits for all 3

Double Buffering

Overlap computation with data transfer for maximum performance:
Double Buffer Pattern
#define TAG_A 1
#define TAG_B 2
#define BLOCK_SIZE 4096

uint8_t buffer_a[BLOCK_SIZE] __attribute__((aligned(128)));
uint8_t buffer_b[BLOCK_SIZE] __attribute__((aligned(128)));

// Which buffer holds the block that is ready to be processed:
// 0 = buffer_a, 1 = buffer_b
int current = 0;

// Nothing to load or process when there are no blocks
if (num_blocks > 0) {
    // Initial load into buffer A
    mfc_get(buffer_a, input_ea, BLOCK_SIZE, TAG_A, 0, 0);
    mfc_write_tag_mask(1 << TAG_A);
    mfc_read_tag_status_all();

    for (int i = 1; i < num_blocks; i++) {
        // Widen before multiplying so large inputs cannot overflow int
        uint64_t next_ea = input_ea + (uint64_t)i * BLOCK_SIZE;

        if (current == 0) {
            // Start loading next block into B while processing A
            mfc_get(buffer_b, next_ea, BLOCK_SIZE, TAG_B, 0, 0);

            // Process buffer A
            process_data(buffer_a, BLOCK_SIZE);

            // Wait for B to finish loading
            mfc_write_tag_mask(1 << TAG_B);
            mfc_read_tag_status_all();

            current = 1;
        } else {
            // Start loading next block into A while processing B
            mfc_get(buffer_a, next_ea, BLOCK_SIZE, TAG_A, 0, 0);

            // Process buffer B
            process_data(buffer_b, BLOCK_SIZE);

            // Wait for A to finish loading
            mfc_write_tag_mask(1 << TAG_A);
            mfc_read_tag_status_all();

            current = 0;
        }
    }

    // Process the final block. It is in whichever buffer was loaded last,
    // which is buffer_a whenever num_blocks is odd (including 1).
    if (current == 1) {
        process_data(buffer_b, BLOCK_SIZE);
    } else {
        process_data(buffer_a, BLOCK_SIZE);
    }
}

List DMA

Transfer multiple non-contiguous memory regions in a single operation:
DMA List Transfer
#include <dma/spu_dma.h>

// List element structure
// One entry of a DMA list: describes a single region to transfer.
// Two 32-bit fields (8 bytes), with the type itself aligned to 8 bytes.
typedef struct {
    uint32_t size;     // Transfer size of this region (presumably in bytes - confirm)
    uint32_t ea_low;   // Low 32 bits of effective address; the high 32 bits are supplied once per list command
} __attribute__((aligned(8))) dma_list_element;

dma_list_element list[8] __attribute__((aligned(8)));
uint8_t buffer[16384] __attribute__((aligned(16)));

// Setup list to transfer 8 scattered regions
// Each entry covers 1024 units and consecutive entries start 2048 apart,
// so the source regions are non-contiguous.
uint64_t base_ea = input_ea;
for (int i = 0; i < 8; i++) {
    list[i].size = 1024;
    // Only the low 32 bits are stored per entry, so every region must share
    // the same upper 32 bits of effective address
    list[i].ea_low = (base_ea + i * 2048) & 0xFFFFFFFF;
}

// Perform list DMA (transfers all 8 regions)
// The list size is passed as the byte size of the whole array (sizeof(list)).
// NOTE(review): the second argument here is the upper 32 bits of base_ea; the
// underlying mfc_getl macro takes the full 64-bit EA and extracts the high word
// itself - confirm which form spu_dma_list_get expects
spu_dma_list_get(buffer, base_ea >> 32, list, 
                 sizeof(list), TAG, 0, 0);

// Wait for the list command on TAG to complete
mfc_write_tag_mask(1 << TAG);
mfc_read_tag_status_all();

Atomic Operations

Safe read-modify-write operations on shared memory:
Atomic Compare and Swap
uint8_t ls_buffer[128] __attribute__((aligned(128)));
uint64_t ea = shared_counter_ea;  // Must be 128-byte aligned

// Get lock line atomically
mfc_getllar(ls_buffer, ea, 0, 0);
mfc_read_atomic_status();

// Modify value
uint32_t *counter = (uint32_t*)ls_buffer;
(*counter)++;

// Conditionally put back (fails if another SPU modified it)
mfc_putllc(ls_buffer, ea, 0, 0);
if (mfc_read_atomic_status() == 0) {
    // Another SPU modified it, retry...
} else {
    // Success!
}

Small Data Transfers

For transferring individual values (1, 2, 4, or 8 bytes):
Typed DMA Transfers
#include <dma/spu_dma.h>

#define TAG 1

// Write single values
spu_dma_put_uint32(0x12345678, counter_ea, TAG, 0, 0);
spu_dma_put_uint64(0xDEADBEEFCAFEBABE, timestamp_ea, TAG, 0, 0);

// Read single values
uint32_t value32 = spu_dma_get_uint32(status_ea, TAG, 0, 0);
uint64_t value64 = spu_dma_get_uint64(result_ea, TAG, 0, 0);
Small transfers have alignment requirements based on size:
  • 1-byte: no alignment required
  • 2-byte: 2-byte aligned
  • 4-byte: 4-byte aligned
  • 8-byte: 8-byte aligned
Local store and effective addresses must have the same low 4 bits.

Large Transfers

Automatically chunk transfers larger than 16 KB:
Large Transfer Example
#include <dma/spu_dma.h>

#define TAG 1

uint8_t large_buffer[65536] __attribute__((aligned(128)));

// Transfer 64 KB (automatically split into multiple 16KB DMAs)
spu_dma_large_get(large_buffer, input_ea, 65536, TAG, 0, 0);

// Wait for all chunks to complete
mfc_write_tag_mask(1 << TAG);
mfc_read_tag_status_all();

DMA and Cache Coherency

Important: DMA transfers bypass the PPU cache. Always use aligned buffers and ensure proper memory barriers.
PPU Side - Proper Alignment
#include <malloc.h>

// Always use memalign for DMA buffers
// Prefer 128-byte alignment for best performance
uint8_t *buffer = memalign(128, data_size);

if (!buffer) {
    printf("Allocation failed\n");
    return -1;
}

// Ensure data is written before SPU accesses it
__sync_synchronize();  // Memory barrier

// Start SPU thread...

// Wait for SPU completion...

// Ensure SPU writes are visible
__sync_synchronize();

free(buffer);

Common DMA Patterns

Scatter-Gather

Process Multiple Regions
// SPU processes multiple scattered input regions
for (int i = 0; i < num_regions; i++) {
    uint64_t ea = base_ea + regions[i].offset;
    uint32_t size = regions[i].size;
    
    mfc_get(buffer, ea, size, TAG, 0, 0);
    mfc_write_tag_mask(1 << TAG);
    mfc_read_tag_status_all();
    
    process_data(buffer, size);
    
    mfc_put(buffer, output_ea + i * size, size, TAG, 0, 0);
}

Pipeline Pattern

Three-Stage Pipeline
#define TAG_LOAD   1
#define TAG_STORE  2

uint8_t buf_in[BLOCK_SIZE]  __attribute__((aligned(128)));
uint8_t buf_out[BLOCK_SIZE] __attribute__((aligned(128)));

for (int i = 0; i < num_blocks; i++) {
    // Widen before multiplying so large inputs cannot overflow int
    uint64_t block_offset = (uint64_t)i * BLOCK_SIZE;

    // Stage 1: Load the next block. The GET is queued first so it overlaps
    // with the previous block's store, which may still be in flight.
    mfc_get(buf_in, input_ea + block_offset,
            BLOCK_SIZE, TAG_LOAD, 0, 0);

    // Wait for the load AND for the previous store: buf_out must not be
    // overwritten while its PUT is still reading from it
    mfc_write_tag_mask((1 << TAG_LOAD) | (1 << TAG_STORE));
    mfc_read_tag_status_all();

    // Stage 2: Process
    process_block(buf_in, buf_out, BLOCK_SIZE);

    // Stage 3: Store result (completes in the background)
    mfc_put(buf_out, output_ea + block_offset,
            BLOCK_SIZE, TAG_STORE, 0, 0);
}

// Wait for final store
mfc_write_tag_mask(1 << TAG_STORE);
mfc_read_tag_status_all();

Debugging DMA Issues

Symptoms: SPU stops or hangs. Causes:
  • Local store or effective address not 16-byte aligned
  • Transfer size not multiple of 16 bytes
Solution:
// Enable verbose assertions during development
#define SPU_DMA_ASSERT_VERBOSE
#include <dma/spu_dma.h>

// Use aligned attribute
uint8_t buffer[1024] __attribute__((aligned(128)));
Symptoms: DMA commands appear to be ignored. Cause: More than 16 outstanding DMA commands. Solution:
// Wait for some DMAs before issuing more
// NOTE: a tag group with nothing outstanding already counts as complete, so in
// real code narrow the mask to the tags that actually have commands in flight
mfc_write_tag_mask(0xFFFFFFFF);  // All 32 tags (0-31)
mfc_read_tag_status_any();   // Wait for any to complete
Symptoms: Stale or corrupted data. Cause: PPU cache not synchronized with DMA. Solution:
// PPU side - use memory barriers
__sync_synchronize();

// Allocate DMA buffers with memalign so they are 128-byte aligned.
// memalign only controls alignment: it returns ordinary heap memory, not
// uncached memory, so the barrier above is still required.
void *buffer = memalign(128, size);

Performance Tips

Maximize Transfer Size

Use the full 16 KB per DMA command when possible

Double Buffer

Overlap DMA transfers with computation

Minimize Waits

Issue multiple DMAs before waiting

128-byte Alignment

Use 128-byte alignment for best cache performance

Next Steps

Thread Management

Learn SPU thread creation and control

SPURS Framework

High-level task scheduling system

Build docs developers (and LLMs) love