Overview
warp-md supports both CPU and GPU (CUDA) execution for analysis Plans. Device selection is controlled via the device parameter.
Device Specification
Every Plan’s run() method accepts a device parameter:
result = plan.run(
traj,
system,
device = "auto" # "auto" | "cpu" | "cuda" | "cuda:0" | "cuda:1"
)
Device Options
"auto" (recommended) — automatically selects:
GPU (CUDA device 0) if available
CPU fallback if GPU unavailable
# Uses GPU if built with CUDA support
rg = plan.run(traj, system, device = "auto" )
"cpu" — Force CPU execution, even if GPU is available.
# Always use CPU
rg = plan.run(traj, system, device = "cpu" )
"cuda" — Use GPU device 0. Raises an error if CUDA is unavailable.
# Require GPU
rg = plan.run(traj, system, device = "cuda" )
"cuda:N" — Use GPU device N (for multi-GPU systems).
# Use second GPU
rg = plan.run(traj, system, device = "cuda:1" )
Implementation
From crates/traj-engine/src/executor.rs:9-52:
/// Execution target for an analysis Plan: host CPU, or a CUDA GPU when
/// the crate is built with the `cuda` feature.
pub enum Device {
/// CPU execution; always available.
Cpu ,
/// GPU execution, carrying an initialized CUDA context.
/// Only present in builds compiled with `--features cuda`.
#[cfg(feature = "cuda" )]
Cuda ( GpuContext ),
}
impl Device {
pub fn from_spec ( spec : & str ) -> TrajResult < Self > {
let spec = spec . trim ();
if spec . eq_ignore_ascii_case ( "cpu" ) {
return Ok ( Device :: Cpu );
}
if spec . eq_ignore_ascii_case ( "auto" ) {
#[cfg(feature = "cuda" )]
{
if let Ok ( ctx ) = GpuContext :: new ( 0 ) {
return Ok ( Device :: Cuda ( ctx ));
}
}
return Ok ( Device :: Cpu );
}
if spec . to_ascii_lowercase () . starts_with ( "cuda" ) {
#[cfg(feature = "cuda" )]
{
let idx = parse_cuda_index ( spec ) ? ;
return Ok ( Device :: Cuda ( GpuContext :: new ( idx ) ? ));
}
#[cfg(not(feature = "cuda" )]
{
return Err ( TrajError :: Unsupported (
"cuda feature disabled; rebuild with --features cuda" . into (),
));
}
}
Err ( TrajError :: Unsupported ( format! ( "unknown device spec '{spec}'" )))
}
}
The device="auto" logic attempts GPU initialization. If it fails (no CUDA, no GPU, driver issue), it silently falls back to CPU.
Checking CUDA Availability
import warp_md
print(f"Build profile: {warp_md.__rust_build_profile__}")
print(f"CUDA enabled: {warp_md.__rust_cuda_enabled__}")
From python/warp_md/__init__.py:145-146:
__rust_build_profile__ = getattr (traj_py, "__rust_build_profile__" , "unknown" )
__rust_cuda_enabled__ = bool ( getattr (traj_py, "__rust_cuda_enabled__" , False ))
>>> warp_md.__rust_cuda_enabled__
True
When to Use GPU
Large Systems (>10k atoms)
GPU shines for systems with many atoms. Memory bandwidth dominates. Example : 50k-atom solvated protein, 10k frames
CPU: ~45 seconds (RMSD)
GPU: ~3 seconds (15x speedup)
Long Trajectories (>1k frames)
Amortizes CUDA kernel launch overhead and memory transfers.
Plans with heavy computation benefit most:
PairwiseRmsdPlan (O(n²) RMSD matrix)
TrajectoryClusterPlan (DBSCAN/k-means)
RdfPlan (pair distance histograms)
GistGridPlan (grid-based energy calculations)
When to Use CPU
Small Systems (<1k atoms)
Transfer overhead dominates. CPU may be faster.
Insufficient work to amortize GPU setup.
Some Plans are I/O or memory-bound, not compute-bound:
Simple distance calculations
Geometric transformations
Memory Management
Chunk Size Tuning
The chunk_frames parameter controls memory usage:
# Small chunks: lower memory, more I/O overhead
rg = plan.run(traj, system, chunk_frames = 64 )
# Large chunks: higher memory, better GPU utilization
rg = plan.run(traj, system, chunk_frames = 512 )
GPU tuning : Use larger chunks (256-1024) to maximize GPU occupancy and amortize transfer costs.
Memory Budget
Some Plans accept memory_budget_bytes:
from warp_md import TrajectoryClusterPlan
plan = TrajectoryClusterPlan(
selection,
method = "dbscan" ,
memory_budget_bytes = 512 * 1024 * 1024 # 512 MB
)
From python/warp_md/tests/test_clustering.py:46-56:
class _DummyPlan :
def __init__ (
self ,
sel ,
method = "dbscan" ,
# ...
memory_budget_bytes = None ,
):
called[ "memory_budget_bytes" ] = memory_budget_bytes
Multi-GPU Support
Selecting GPU by Index
# Use first GPU
rg_0 = plan.run(traj, system, device = "cuda:0" )
# Use second GPU
rg_1 = plan.run(traj, system, device = "cuda:1" )
Parallel Execution
For multiple trajectories, use Python multiprocessing:
from multiprocessing import Pool
def analyze ( traj_path , device ):
system = System.from_pdb( "protein.pdb" )
traj = Trajectory.open_xtc(traj_path, system)
plan = RgPlan(system.select( "backbone" ))
return plan.run(traj, system, device = device)
with Pool( 2 ) as pool:
results = pool.starmap(analyze, [
( "traj1.xtc" , "cuda:0" ),
( "traj2.xtc" , "cuda:1" ),
])
warp-md does not automatically distribute work across GPUs. You must manually assign devices.
Building with CUDA
From Source
# Ensure CUDA toolkit installed (nvcc in PATH)
export CUDA_HOME=/usr/local/cuda
# Build with CUDA support
maturin develop --release --features cuda
# Or install wheel
pip install warp-md # Pre-built wheels include CUDA
Verify CUDA Build
import warp_md
assert warp_md.__rust_cuda_enabled__, "CUDA not enabled"
Pre-built wheels on PyPI include CUDA support (CUDA 11.8+ required). Source builds require nvcc.
Troubleshooting
GPU Not Detected
>>> plan.run(traj, system, device = "cuda" )
RuntimeError: cuda feature disabled; rebuild with --features cuda
Solution : Install CUDA-enabled build:
pip install --force-reinstall warp-md
Out of Memory (OOM)
>>> plan.run(traj, system, device = "cuda" , chunk_frames = 2048 )
RuntimeError : CUDA out of memory
Solutions :
Reduce chunk_frames
Use memory_budget_bytes if Plan supports it
Fall back to CPU
# Use smaller chunks
rg = plan.run(traj, system, device = "cuda" , chunk_frames = 128 )
Driver Version Mismatch
RuntimeError: CUDA driver version insufficient
Solution : Update NVIDIA drivers to match CUDA toolkit version.
Examples
Auto Device Selection
from warp_md import System, Trajectory, RmsdPlan
system = System.from_pdb( "protein.pdb" )
traj = Trajectory.open_xtc( "trajectory.xtc" , system)
backbone = system.select( "backbone" )
plan = RmsdPlan(backbone, reference = "topology" , align = True )
# Let warp-md choose
rmsd = plan.run(traj, system, device = "auto" )
Force CPU for Reproducibility
# Some CUDA kernels have non-deterministic floating-point rounding
rmsd_cpu = plan.run(traj, system, device = "cpu" )
Explicit GPU with Error Handling
try :
rmsd = plan.run(traj, system, device = "cuda" )
except RuntimeError as e:
print ( f "GPU failed: { e } " )
print ( "Falling back to CPU..." )
rmsd = plan.run(traj, system, device = "cpu" )
See Also