SPUs cannot directly access main memory. All data transfer between main memory and SPU local store must be performed via DMA (Direct Memory Access) using the Memory Flow Controller (MFC).
The PSL1GHT spu_dma.h library provides high-level wrappers with automatic alignment checking and error detection over the raw MFC intrinsics.
The wrapper API exposed by dma/spu_dma.h is summarized below:
Safe DMA Functions
#include <dma/spu_dma.h>

/* Normal transfers: LS and EA 16-byte aligned, size a multiple of 16. */
spu_dma_get(ls, ea, size, tag, 0, 0);
spu_dma_put(ls, ea, size, tag, 0, 0);

/* Ordered variants. */
spu_dma_getb(ls, ea, size, tag, 0, 0); /* GET with barrier */
spu_dma_putf(ls, ea, size, tag, 0, 0); /* PUT with fence */

/* Small transfers: exactly 1, 2, 4, or 8 bytes. */
spu_dma_small_get(ls, ea, 4, tag, 0, 0);
spu_dma_small_put(ls, ea, 4, tag, 0, 0);

/* Large transfers: > 16 KB, split into chunks automatically. */
spu_dma_large_get(ls, ea, 32768, tag, 0, 0);
spu_dma_large_put(ls, ea, 32768, tag, 0, 0);
The high-level wrappers include assertions that will halt the SPU if alignment or size constraints are violated. In production, define NO_SPU_DMA_ASSERT to disable checks.
Overlap computation with data transfer for maximum performance:
Double Buffer Pattern
#define TAG_A 1#define TAG_B 2#define BLOCK_SIZE 4096uint8_t buffer_a[BLOCK_SIZE] __attribute__((aligned(128)));uint8_t buffer_b[BLOCK_SIZE] __attribute__((aligned(128)));uint64_t ea_offset = 0;int current = 0;// Initial load into buffer Amfc_get(buffer_a, input_ea, BLOCK_SIZE, TAG_A, 0, 0);mfc_write_tag_mask(1 << TAG_A);mfc_read_tag_status_all();for (int i = 1; i < num_blocks; i++) { if (current == 0) { // Start loading next block into B while processing A mfc_get(buffer_b, input_ea + i * BLOCK_SIZE, BLOCK_SIZE, TAG_B, 0, 0); // Process buffer A process_data(buffer_a, BLOCK_SIZE); // Wait for B to finish loading mfc_write_tag_mask(1 << TAG_B); mfc_read_tag_status_all(); current = 1; } else { // Start loading next block into A while processing B mfc_get(buffer_a, input_ea + i * BLOCK_SIZE, BLOCK_SIZE, TAG_A, 0, 0); // Process buffer B process_data(buffer_b, BLOCK_SIZE); // Wait for A to finish loading mfc_write_tag_mask(1 << TAG_A); mfc_read_tag_status_all(); current = 0; }}// Process final bufferif (current == 1) { process_data(buffer_b, BLOCK_SIZE);}
Safe read-modify-write operations on shared memory:
Atomic Compare and Swap
uint8_t ls_buffer[128] __attribute__((aligned(128)));uint64_t ea = shared_counter_ea; // Must be 128-byte aligned// Get lock line atomicallymfc_getllar(ls_buffer, ea, 0, 0);mfc_read_atomic_status();// Modify valueuint32_t *counter = (uint32_t*)ls_buffer;(*counter)++;// Conditionally put back (fails if another SPU modified it)mfc_putllc(ls_buffer, ea, 0, 0);if (mfc_read_atomic_status() == 0) { // Another SPU modified it, retry...} else { // Success!}
#include <dma/spu_dma.h>

#define TAG 1

uint8_t large_buffer[65536] __attribute__((aligned(128)));

/* Pull 64 KB from main memory; the wrapper splits the request into
 * multiple 16 KB MFC DMAs, all issued on the same tag group. */
spu_dma_large_get(large_buffer, input_ea, 65536, TAG, 0, 0);

/* A single tag-group wait therefore covers every chunk. */
mfc_write_tag_mask(1 << TAG);
mfc_read_tag_status_all();
Important: MFC DMA transfers are cache-coherent with the PPE's caches, but the compiler and CPU may still reorder ordinary loads and stores around the hand-off. Always use properly aligned buffers and place memory barriers on both sides of the transfer.
PPU Side - Proper Alignment
#include <malloc.h>// Always use memalign for DMA buffers// Prefer 128-byte alignment for best performanceuint8_t *buffer = memalign(128, data_size);if (!buffer) { printf("Allocation failed\n"); return -1;}// Ensure data is written before SPU accesses it__sync_synchronize(); // Memory barrier// Start SPU thread...// Wait for SPU completion...// Ensure SPU writes are visible__sync_synchronize();free(buffer);