Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/opensandbox-group/OpenSandbox/llms.txt

Use this file to discover all available pages before exploring further.

Running reinforcement learning experiments directly on a development machine or shared server creates environment conflicts, resource contention, and reproducibility problems. OpenSandbox solves this by provisioning a clean container for each training run, installing RL dependencies from a requirements.txt at runtime, executing the training script, and making the model checkpoint and JSON summary available through the sandbox file API when training completes.

Prerequisites

1

Start the OpenSandbox server

uv pip install opensandbox-server
opensandbox-server init-config ~/.sandbox.toml --example docker
opensandbox-server
2

Install the Python SDK

uv pip install opensandbox

Environment Variables

| Variable | Default | Description |
| --- | --- | --- |
| SANDBOX_DOMAIN | localhost:8080 | Sandbox service address |
| SANDBOX_API_KEY | (optional) | API key if your server requires authentication |
| SANDBOX_IMAGE | sandbox-registry…/code-interpreter:v1.1.0 | Docker image to use |
| RL_TIMESTEPS | 5000 | Number of training timesteps to run |

RL Dependencies

The example script (main.py) installs these packages inside the sandbox at runtime, before training starts:
gymnasium==0.29.1
stable-baselines3==2.3.2
tensorboard==2.16.2
torch==2.12.0

Full Example

The script reads requirements.txt from the directory that contains main.py, writes it and the training script (train.py) into the sandbox, installs dependencies, runs training, and reads training_summary.json back to the host. The training script itself is generated as an inline string, so requirements.txt is the only additional file that must be present on the host.
import asyncio
import os
import textwrap
from datetime import timedelta
from pathlib import Path

from opensandbox import Sandbox
from opensandbox.config import ConnectionConfig


def _load_requirements() -> str:
    """Return the text of the ``requirements.txt`` that sits beside this file.

    The path is derived from ``__file__``, so the lookup does not depend on
    the current working directory.

    Returns:
        The file contents decoded as UTF-8.
    """
    return Path(__file__).with_name("requirements.txt").read_text(encoding="utf-8")


def _training_script() -> str:
    """Return the source text of the ``train.py`` script run in the sandbox.

    The generated script reads ``RL_TIMESTEPS`` and ``RL_TENSORBOARD_LOG``
    from the environment, trains a DQN agent on ``CartPole-v1``, saves the
    model under ``checkpoints/cartpole_dqn``, evaluates it for 5 episodes,
    and writes the results to ``training_summary.json``.

    Returns:
        The script as a string with no leading blank line and no common
        indentation.
    """
    # The trailing backslash drops the newline after the opening quotes, so
    # the dedented text already starts at "import json".
    indented_source = """\
        import json
        import os

        import gymnasium as gym
        from stable_baselines3 import DQN
        from stable_baselines3.common.evaluation import evaluate_policy

        timesteps = int(os.getenv("RL_TIMESTEPS", "5000"))
        tensorboard_log = os.getenv("RL_TENSORBOARD_LOG", "runs")

        env = gym.make("CartPole-v1")
        model = DQN(
            "MlpPolicy",
            env,
            verbose=1,
            tensorboard_log=tensorboard_log,
            learning_rate=1e-3,
            buffer_size=10000,
            learning_starts=1000,
            batch_size=32,
            train_freq=4,
            gradient_steps=1,
        )

        model.learn(total_timesteps=timesteps)

        os.makedirs("checkpoints", exist_ok=True)
        checkpoint_path = "checkpoints/cartpole_dqn"
        model.save(checkpoint_path)

        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
        summary = {
            "timesteps": timesteps,
            "mean_reward": float(mean_reward),
            "std_reward": float(std_reward),
            "checkpoint_path": f"{checkpoint_path}.zip",
        }
        with open("training_summary.json", "w", encoding="utf-8") as handle:
            json.dump(summary, handle, indent=2)

        print("Training summary:", summary)
        env.close()
    """
    return textwrap.dedent(indented_source)


async def _print_execution_logs(execution) -> None:
    """Print a finished command's captured output to the host's stdout.

    Each stdout message is printed with a ``[stdout]`` prefix, then each
    stderr message with ``[stderr]``. If the execution carries a truthy
    ``error``, a final ``[error]`` line with its name and value follows.

    Args:
        execution: Result object exposing ``logs.stdout``, ``logs.stderr``
            (iterables of items with a ``text`` attribute) and ``error``.
    """
    for line in execution.logs.stdout:
        print(f"[stdout] {line.text}")
    for line in execution.logs.stderr:
        print(f"[stderr] {line.text}")

    # Nothing more to report for a clean run.
    if not execution.error:
        return
    print(f"[error] {execution.error.name}: {execution.error.value}")


def _execution_failed(execution) -> bool:
    """Report whether a command execution ended with an error.

    Args:
        execution: Result object exposing an ``error`` attribute.

    Returns:
        ``True`` when ``execution.error`` is anything other than ``None``.
    """
    error = execution.error
    if error is None:
        return False
    return True


async def _run_command(sandbox: Sandbox, command: str) -> bool:
    """Run one shell command in the sandbox and echo its output.

    Args:
        sandbox: Sandbox whose ``commands.run`` executes the command.
        command: Command line to execute.

    Returns:
        ``True`` when the execution reported no error, ``False`` otherwise.
    """
    result = await sandbox.commands.run(command)
    await _print_execution_logs(result)
    succeeded = not _execution_failed(result)
    return succeeded


def _with_python_env(command: str) -> str:
    """Wrap ``command`` so it runs in a login shell after the env script.

    The resulting command line invokes ``bash -lc`` with a script that first
    sources ``/opt/code-interpreter/code-interpreter-env.sh`` (passing
    ``python`` and ``${PYTHON_VERSION:-3.14}`` as arguments, output
    discarded) and, if that succeeds, runs ``command``.

    Args:
        command: Shell snippet to run after the environment script. It may
            contain single quotes; they are escaped for the outer shell.

    Returns:
        A single command line of the form ``bash -lc '<script>'``.
    """
    # Imported locally so the helper does not depend on a file-level import.
    import shlex

    activate = (
        "source /opt/code-interpreter/code-interpreter-env.sh "
        "python ${PYTHON_VERSION:-3.14} >/dev/null"
    )
    script = f"{activate} && {command}"
    # shlex.quote single-quotes the script (so ${PYTHON_VERSION:-3.14} is
    # still expanded by the inner bash, not the outer shell) and escapes any
    # single quote inside ``command``; hand-written quotes would be closed
    # early by such a command.
    return "bash -lc " + shlex.quote(script)


async def _ensure_pip(sandbox: Sandbox) -> bool:
    """Make sure pip is usable inside the sandbox.

    Tries a sequence of commands in order and stops at the first one that
    succeeds: a ``pip --version`` probe, ``ensurepip``, then an ``apt-get``
    install and finally an ``apk`` install of the pip package.

    Args:
        sandbox: Sandbox in which the commands are run.

    Returns:
        ``True`` as soon as one command succeeds, ``False`` if all fail.
    """
    probe = _with_python_env("python3 -m pip --version")
    ensurepip = _with_python_env("python3 -m ensurepip --upgrade")
    apt_install = "apt-get update && apt-get install -y python3-pip"
    apk_install = "apk add --no-cache py3-pip"

    for attempt in (probe, ensurepip, apt_install, apk_install):
        succeeded = await _run_command(sandbox, attempt)
        if succeeded:
            return True
    return False


async def _install_requirements(sandbox: Sandbox) -> bool:
    """Install the packages listed in the sandbox's ``requirements.txt``.

    Tries, in order, ``python3 -m pip`` inside the Python environment (with
    ``--break-system-packages``), then plain ``pip3``, then plain ``pip``,
    stopping at the first command that succeeds.

    Args:
        sandbox: Sandbox in which the install commands are run.

    Returns:
        ``True`` as soon as one install command succeeds, ``False`` if all
        of them fail.
    """
    preferred = _with_python_env(
        "python3 -m pip install --no-cache-dir --break-system-packages -r requirements.txt"
    )
    fallbacks = (
        "pip3 install --no-cache-dir -r requirements.txt",
        "pip install --no-cache-dir -r requirements.txt",
    )

    for attempt in (preferred, *fallbacks):
        installed = await _run_command(sandbox, attempt)
        if installed:
            return True
    return False


async def main() -> None:
    """Provision a sandbox, train the RL agent in it, and print the summary.

    Configuration comes from the environment: ``SANDBOX_DOMAIN``,
    ``SANDBOX_API_KEY``, ``SANDBOX_IMAGE`` and ``RL_TIMESTEPS`` (the last one
    is forwarded into the sandbox environment).

    Steps: upload ``requirements.txt``, bootstrap pip, install dependencies,
    upload and run ``train.py``, then read ``training_summary.json`` back.
    Any failed step prints a message and returns early. The sandbox is
    killed in every case once it has been entered.
    """
    server_domain = os.getenv("SANDBOX_DOMAIN", "localhost:8080")
    server_api_key = os.getenv("SANDBOX_API_KEY")
    sandbox_image = os.getenv(
        "SANDBOX_IMAGE",
        "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/code-interpreter:v1.1.0",
    )
    sandbox_env = {"RL_TIMESTEPS": os.getenv("RL_TIMESTEPS", "5000")}

    connection = ConnectionConfig(
        domain=server_domain,
        api_key=server_api_key,
        request_timeout=timedelta(minutes=10),
    )
    sandbox = await Sandbox.create(
        sandbox_image,
        connection_config=connection,
        env=sandbox_env,
    )

    async with sandbox:
        try:
            await sandbox.files.write_file("requirements.txt", _load_requirements())

            pip_ready = await _ensure_pip(sandbox)
            if not pip_ready:
                print("Failed to bootstrap pip inside the sandbox.")
                return

            deps_installed = await _install_requirements(sandbox)
            if not deps_installed:
                print("Failed to install RL dependencies inside the sandbox.")
                return

            await sandbox.files.write_file("train.py", _training_script())
            # _run_command runs the command, echoes its logs and reports
            # success, which is exactly the sequence needed for training.
            trained = await _run_command(sandbox, _with_python_env("python3 train.py"))
            if not trained:
                print("Training failed inside the sandbox.")
                return

            try:
                summary_text = await sandbox.files.read_file("training_summary.json")
            except Exception as exc:
                print(f"\nFailed to read training summary: {exc}")
                return

            print("\n=== Training summary ===")
            print(summary_text)
        finally:
            # Runs on every exit path above, including the early returns.
            await sandbox.kill()


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
Run the example:
uv run python examples/rl-training/main.py

How Checkpoints Are Saved and Retrieved

1

Training

stable_baselines3.DQN.learn() trains a policy for RL_TIMESTEPS steps on the CartPole-v1 environment. TensorBoard event files are written to the runs/ directory inside the sandbox.
2

Checkpoint save

After training, model.save("checkpoints/cartpole_dqn") writes the model weights to checkpoints/cartpole_dqn.zip inside the sandbox working directory.
3

Evaluation and summary

evaluate_policy() runs 5 evaluation episodes and records mean_reward and std_reward. The summary (including the checkpoint path) is written to training_summary.json.
4

Retrieve from host

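sandbox.files.read_file("training_summary.json") reads the JSON summary back to the host. Use sandbox.files.read_bytes("checkpoints/cartpole_dqn.zip") to download the model checkpoint to a local file. The example calls sandbox.kill() in its finally block, so retrieve any files you need before main() returns.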

TensorBoard

The training script logs to runs/ inside the sandbox. To inspect training metrics, open a shell in the sandbox while it is still running (the example kills the sandbox as soon as main() finishes) and start TensorBoard:
tensorboard --logdir runs --host 0.0.0.0 --port 6006
Then retrieve the TensorBoard endpoint with sandbox.get_endpoint(6006).

References

Build docs developers (and LLMs) love