Documentation Index
Fetch the complete documentation index at: https://mintlify.com/MilesONerd/neurenix/llms.txt
Use this file to discover all available pages before exploring further.
Overview
Neurenix provides support for Neural Processing Units (NPUs), specialized hardware accelerators designed for efficient AI inference on edge devices. NPUs offer:
- Power efficiency - Optimized for low-power operation
- Low latency - Dedicated hardware for neural network operations
- Quantization support - INT8/INT16 operations for efficiency
- Edge deployment - Designed for mobile and embedded systems
- Real-time inference - Deterministic performance for edge applications
Supported NPUs
Mobile NPUs
- Apple Neural Engine (A-series, M-series)
- Qualcomm Hexagon DSP/NPU (Snapdragon)
- MediaTek APU (Dimensity)
- Samsung NPU (Exynos)
- Google Edge TPU
Embedded NPUs
- Arm Ethos-U55, Ethos-U65
- Intel Movidius Myriad X
- NVIDIA Deep Learning Accelerator (DLA)
- Hailo-8, Hailo-15
- Kneron KL series
Installation
# Install Neurenix with NPU support
pip install neurenix-npu
# Build from source
export NEURENIX_WITH_NPU=1
pip install -e .
# Install vendor-specific SDK (example for Edge TPU)
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
echo "deb https://packages.cloud.google.com/apt coral-edgetpu-stable main" | sudo tee /etc/apt/sources.list.d/coral-edgetpu.list
sudo apt-get update
sudo apt-get install libedgetpu1-std python3-pycoral
Device Management
Check NPU Availability
import neurenix as nx
# Check NPU availability
if nx.npu.is_available():
    print("NPU is available")
    print(f"NPU devices: {nx.npu.device_count()}")
else:
    print("NPU not available")
// C++ NPU detection
#include "phynexus/hardware/npu.h"
using namespace phynexus::hardware;
if (initialize_npu()) {
int device_count = get_npu_device_count();
std::cout << "NPU devices: " << device_count << std::endl;
}
Get Device Properties
# Get NPU properties
props = nx.npu.get_device_properties(0)
print(f"Device: {props.name}")
print(f"Memory: {props.total_memory / (1024**2):.2f} MB")
print(f"Supported operations: {props.supported_ops}")
print(f"Max batch size: {props.max_batch_size}")
print(f"Quantization support: {props.quantization_support}")
// C++ device properties
auto props = get_npu_device_properties(0);
std::cout << "Device: " << props.name << std::endl;
std::cout << "Memory: " << props.total_memory / (1024*1024) << " MB" << std::endl;
Set Current Device
# Set NPU device
nx.npu.set_device(0)
# Get current device
current = nx.npu.current_device()
print(f"Current NPU: {current}")
# Create tensors on NPU
device = nx.Device.npu(0)
tensor = nx.randn(100, 100, device=device)
Model Compilation
Compile for NPU
import neurenix as nx
from neurenix.npu import compile_for_npu
# Load model
model = nx.load('model.pt')
# Compile for NPU
npu_model = compile_for_npu(
model,
input_shape=(1, 3, 224, 224),
quantization='int8',
optimization_level=3,
target_npu='auto' # Auto-detect NPU
)
# Run inference on NPU
input_tensor = nx.randn(1, 3, 224, 224)
output = npu_model(input_tensor)
Supported Operations
# Check which operations are supported on NPU
from neurenix.npu import get_supported_operations
supported_ops = get_supported_operations()
print("Supported operations:")
for op in supported_ops:
    print(f" - {op}")
# Check if specific operation is supported
if 'conv2d' in supported_ops:
    print("Convolution is supported")
Quantization
Overview
NPUs typically require quantized models for optimal performance:
from neurenix.quantization import quantize_model
# Quantize model for NPU
quantized_model = quantize_model(
model,
calibration_data=calibration_loader,
quantization_scheme='int8',
target='npu'
)
# Compile quantized model for NPU
npu_model = compile_for_npu(quantized_model, input_shape=(1, 3, 224, 224))
Post-Training Quantization
from neurenix.quantization import quantize_dynamic, quantize_static
# Dynamic quantization (no calibration needed)
dynamic_model = quantize_dynamic(
model,
dtype=nx.qint8,
target='npu'
)
# Static quantization (requires calibration)
static_model = quantize_static(
model,
calibration_data=calibration_loader,
dtype=nx.qint8
)
Quantization-Aware Training
from neurenix.quantization import QuantizationConfig, prepare_qat
# Configure quantization
qat_config = QuantizationConfig(
activation_dtype=nx.quint8,
weight_dtype=nx.qint8,
per_channel=True
)
# Prepare model for QAT
qat_model = prepare_qat(model, qat_config)
# Train with quantization
for epoch in range(num_epochs):
    for batch in dataloader:
        output = qat_model(batch)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
# Convert to quantized model
quantized_model = qat_model.convert()
Memory Management
Allocate Memory
# Allocate tensor on NPU
tensor = nx.zeros((100, 100), device='npu:0')
# Check memory usage
memory_info = nx.npu.memory_info()
print(f"Used memory: {memory_info.used / (1024**2):.2f} MB")
print(f"Free memory: {memory_info.free / (1024**2):.2f} MB")
// C++ NPU memory management
void* ptr = npu_malloc(1024 * 1024 * sizeof(float));
// ... use memory ...
npu_free(ptr);
Memory Transfer
import numpy as np
# Host to NPU
data = np.random.randn(100, 100).astype(np.float32)
tensor_npu = nx.from_numpy(data, device='npu')
# NPU to host
data_back = tensor_npu.cpu().numpy()
# Asynchronous transfer
stream = nx.npu.Stream()
with stream:
    tensor_npu = data_tensor.to('npu', non_blocking=True)
// C++ memory transfer
float* host_data = new float[size];
void* npu_data = npu_malloc(size * sizeof(float));
// Host to NPU
npu_memcpy_host_to_device(npu_data, host_data, size * sizeof(float));
// NPU to host
npu_memcpy_device_to_host(host_data, npu_data, size * sizeof(float));
Edge TPU
Setup
from neurenix.hardware import EdgeTPU
# Initialize Edge TPU
edge_tpu = EdgeTPU()
edge_tpu.initialize()
print(f"Edge TPU version: {edge_tpu.version()}")
print(f"Available Edge TPUs: {edge_tpu.device_count()}")
Compile Model
# Compile model for Edge TPU
from neurenix.edgetpu import compile_for_edgetpu
tflite_model = 'model.tflite'
edgetpu_model = compile_for_edgetpu(
tflite_model,
output_path='model_edgetpu.tflite'
)
# Load and run on Edge TPU
model = EdgeTPU.load_model(edgetpu_model)
output = model.predict(input_data)
Apple Neural Engine
Core ML Conversion
import neurenix as nx
from neurenix.coreml import convert_to_coreml
# Convert to Core ML
model = nx.load('model.pt')
coreml_model = convert_to_coreml(
model,
input_shape=(1, 3, 224, 224),
minimum_deployment_target='iOS15'
)
# Save Core ML model
coreml_model.save('model.mlmodel')
Neural Engine Optimization
# Optimize for Neural Engine
from neurenix.coreml import optimize_for_neural_engine
optimized_model = optimize_for_neural_engine(
coreml_model,
compute_precision='float16'
)
Qualcomm Hexagon NPU
SNPE Integration
from neurenix.snpe import compile_for_snpe
# Compile for Snapdragon NPU
snpe_model = compile_for_snpe(
model,
input_shape=(1, 3, 224, 224),
runtime='dsp', # Use Hexagon DSP
quantization='int8'
)
# Export DLC (Deep Learning Container)
snpe_model.export('model.dlc')
Batch Processing
# Process multiple inputs efficiently
batch_inputs = [input1, input2, input3, input4]
# Batch inference on NPU
outputs = npu_model.batch_predict(batch_inputs)
Model Optimization
from neurenix.npu import optimize_model
# Optimize model for NPU
optimized_model = optimize_model(
model,
input_shape=(1, 3, 224, 224),
optimization_passes=[
'remove_unused_ops',
'fold_batch_norm',
'fuse_conv_bn',
'quantize_weights'
]
)
Operator Fusion
# Enable operator fusion
npu_model = compile_for_npu(
model,
input_shape=(1, 3, 224, 224),
enable_fusion=True
)
Profiling and Debugging
NPU Profiling
# Profile NPU operations
with nx.npu.profiler.profile():
    output = npu_model(input)
# Get profiling results
profile_data = nx.npu.profiler.get_profile_data()
for op in profile_data:
    print(f"{op.name}: {op.duration_ms:.2f} ms")
Benchmark
from neurenix.benchmark import benchmark_npu
# Benchmark NPU inference
results = benchmark_npu(
npu_model,
input_shape=(1, 3, 224, 224),
num_iterations=100,
warmup_iterations=10
)
print(f"Average latency: {results.mean_latency_ms:.2f} ms")
print(f"Throughput: {results.throughput_fps:.2f} fps")
print(f"Power consumption: {results.power_mw:.2f} mW")
Deployment
Export Model
# Export NPU model for deployment
npu_model.export(
'model_npu.bin',
include_metadata=True,
optimize_size=True
)
Mobile Integration
# Export for Android
from neurenix.mobile import export_for_android
export_for_android(
npu_model,
output_path='model_android.tflite',
use_npu=True
)
# Export for iOS
from neurenix.mobile import export_for_ios
export_for_ios(
npu_model,
output_path='model_ios.mlmodel',
use_neural_engine=True
)
Environment Variables
# NPU device selection
export NPU_DEVICE_COUNT=1
export NEURENIX_NPU_DEVICE=0
# Enable NPU debugging
export NEURENIX_NPU_DEBUG=1
# Set performance mode
export NEURENIX_NPU_PERFORMANCE_MODE=high # high, balanced, low_power
# Edge TPU
export CORAL_VISIBLE_DEVICES=0
Common Use Cases
Real-Time Object Detection
import neurenix as nx
from neurenix.models import detection
# Load detection model
model = detection.mobilenet_ssd(pretrained=True)
# Compile for NPU
npu_model = compile_for_npu(
model,
input_shape=(1, 3, 300, 300),
quantization='int8'
)
# Real-time inference
import cv2
cap = cv2.VideoCapture(0)
while True:
    ret, frame = cap.read()
    if not ret:
        break
    # Preprocess
    input_tensor = preprocess(frame)
    # NPU inference
    detections = npu_model(input_tensor)
    # Visualize
    draw_detections(frame, detections)
    cv2.imshow('Detection', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
Edge Classification
# Load classification model
from neurenix.models import mobilenet_v3_small
model = mobilenet_v3_small(pretrained=True)
model.eval()
# Compile for NPU with quantization
npu_model = compile_for_npu(
model,
input_shape=(1, 3, 224, 224),
quantization='int8',
calibration_data=calibration_loader
)
# Run inference
with nx.no_grad():
    output = npu_model(input_image)
    prediction = output.argmax(dim=1)
See Also