The Device class abstracts computational hardware, allowing tensors and models to run on different devices without code changes. Neurenix automatically detects available hardware and provides a unified API across all backends.
# Example: construct devices explicitly, then enumerate every device
# reported by get_available_devices().
from neurenix.device import Device, DeviceType, get_available_devices

# Create device instances
cpu = Device(DeviceType.CPU)
gpu = Device(DeviceType.CUDA, 0)  # First CUDA GPU
tpu = Device(DeviceType.TPU, 0)  # First TPU

# List all available devices
devices = get_available_devices()
for device in devices:
    print(device)
# Example: the second argument to Device selects which device of the given
# type to use; omitting it picks index 0.
from neurenix.device import Device, DeviceType

# Create with default index (0)
cpu = Device(DeviceType.CPU)
cuda_0 = Device(DeviceType.CUDA)  # Equivalent to CUDA:0

# Create with specific index
cuda_1 = Device(DeviceType.CUDA, 1)  # Second GPU
cuda_2 = Device(DeviceType.CUDA, 2)  # Third GPU
# Example: move a tensor between devices with Tensor.to(). Each call's
# result is bound to a new name; the source tensor variable is left as-is.
from neurenix.device import Device, DeviceType
from neurenix.tensor import Tensor

# Create on CPU
tensor = Tensor.randn((1000, 1000))

# Move to GPU (creates new tensor)
gpu_tensor = tensor.to(Device(DeviceType.CUDA, 0))

# Move to different GPU
gpu1_tensor = gpu_tensor.to(Device(DeviceType.CUDA, 1))

# Move back to CPU
cpu_tensor = gpu1_tensor.to(Device(DeviceType.CPU))
# Example: relocate an existing tensor with hot_swap_device() and print
# tensor.device before and after the swap.
from neurenix.device import Device, DeviceType
from neurenix.tensor import Tensor

# Create tensor
tensor = Tensor.randn((1000, 1000))
print(f"Initial: {tensor.device}")

# Hot-swap to GPU (modifies in-place)
tensor.hot_swap_device(Device(DeviceType.CUDA, 0))
print(f"After swap: {tensor.device}")

# More memory efficient than .to()
# Example: request an asynchronous device transfer by passing
# non_blocking=True to Tensor.to().
from neurenix.device import Device, DeviceType
from neurenix.tensor import Tensor

tensor = Tensor.randn((1000, 1000))

# Non-blocking transfer (async)
gpu_tensor = tensor.to(
    Device(DeviceType.CUDA, 0),
    non_blocking=True,
)

# Continue with other operations
# GPU transfer happens in background
# Example: split one batch evenly across every available CUDA device and
# run a per-device copy of the model on each slice.
#
# NOTE(review): `model`, `data` and `total_batch_size` are not defined in
# this snippet; they are assumed to be provided by the surrounding code.
from neurenix.device import Device, DeviceType, get_device_count
from neurenix.tensor import Tensor

# Get all available CUDA devices
num_gpus = get_device_count(DeviceType.CUDA)
gpus = [Device(DeviceType.CUDA, i) for i in range(num_gpus)]

if len(gpus) > 1:
    # Split batch across GPUs
    # NOTE(review): floor division means any remainder samples
    # (total_batch_size % len(gpus)) are not assigned to a GPU.
    batch_size_per_gpu = total_batch_size // len(gpus)
    models = [model.clone().to(gpu) for gpu in gpus]

    # The loop variable is named `replica` so the original `model` is not
    # rebound to the last per-GPU copy once the loop finishes.
    for i, (gpu, replica) in enumerate(zip(gpus, models)):
        start_idx = i * batch_size_per_gpu
        end_idx = start_idx + batch_size_per_gpu
        batch = Tensor(data[start_idx:end_idx], device=gpu)
        output = replica(batch)