Skip to main content

Overview

The state management system uses Pydantic models to provide type-safe, validated tracking of all experiment data. All state is serializable to JSON for persistence and resumption.

Core Models

ExperimentState

The root state model that tracks the entire experiment session.
from pathlib import Path
from src.orchestration.state import ExperimentState, create_initial_state

# Create new state
state = create_initial_state(
    data_path="data.csv",
    target_column="target",
    task_type="classification",
    constraints="Use interpretable models",
    max_iterations=20,
    time_budget=3600
)

# Save state
state.save(Path("state.json"))

# Load state
state = ExperimentState.load(Path("state.json"))
session_id
str
Unique identifier (8-char UUID prefix). Auto-generated.
config
ExperimentConfig
Experiment configuration settings
data_profile
Optional[DataProfile]
Dataset profile from DataProfiler
experiments
list[ExperimentResult]
List of all experiment results
current_iteration
int
Current iteration number (0-based)
phase
ExperimentPhase
Current phase of execution
best_metric
Optional[float]
Best metric value achieved so far
best_experiment
Optional[str]
Name of best-performing experiment
iterations_without_improvement
int
Count for plateau detection
start_time
float
Unix timestamp of session start
gemini_conversation_history
list[ConversationEntry]
Full Gemini conversation for context
termination_reason
Optional[str]
Why the experiment stopped

Methods

add_experiment(result: ExperimentResult) Add an experiment result and update tracking.
state.add_experiment(result)
# Automatically:
# - Increments current_iteration
# - Updates best_metric and best_experiment if improved
# - Tracks iterations_without_improvement
should_terminate() -> tuple[bool, str] Check if experiment loop should stop.
should_stop, reason = state.should_terminate()
if should_stop:
    print(f"Stopping: {reason}")
Returns True if:
  • Max iterations reached
  • Time budget exhausted
  • Plateau detected (3+ iterations without improvement)
  • Target metric achieved
  • Agent recommends stopping
get_elapsed_time() -> float Get elapsed time in seconds.
elapsed = state.get_elapsed_time()
print(f"Running for {elapsed:.1f}s")
get_summary() -> dict Get summary of current state.
summary = state.get_summary()
# {
#   'session_id': 'abc123',
#   'phase': 'experiment_design',
#   'current_iteration': 5,
#   'max_iterations': 20,
#   'elapsed_time': 450.3,
#   'best_metric': 0.876,
#   'best_experiment': 'rf_tuned',
#   'total_experiments': 6,
#   'successful_experiments': 5
# }
save(path: Path) Save state to JSON file.
state.save(Path("state_abc123.json"))
load(path: Path) -> ExperimentState (classmethod) Load state from JSON file.
state = ExperimentState.load(Path("state_abc123.json"))

ExperimentConfig

Configuration for the experiment session.
from src.orchestration.state import ExperimentConfig, TaskType

config = ExperimentConfig(
    data_path="data/housing.csv",
    target_column="price",
    task_type=TaskType.REGRESSION,
    constraints="Focus on linear models",
    max_iterations=15,
    time_budget=1800,
    plateau_threshold=3,
    improvement_threshold=0.005,
    target_metric_value=0.85,
    primary_metric="rmse"
)
data_path
str
Path to dataset file
target_column
str
Name of target column
task_type
TaskType
CLASSIFICATION or REGRESSION
constraints
Optional[str]
Natural language constraints
max_iterations
int
Maximum iterations (default: 20)
time_budget
int
Time budget in seconds (default: 3600)
plateau_threshold
int
Iterations without improvement before stopping (default: 3)
improvement_threshold
float
Minimum relative improvement to count as progress (default: 0.005 = 0.5%)
target_metric_value
Optional[float]
Target metric to achieve (stops when reached)
primary_metric
Optional[str]
Primary metric to optimize (e.g., 'rmse', 'f1', 'accuracy')

ExperimentResult

Results from a single experiment run.
from src.orchestration.state import ExperimentResult, PreprocessingConfig

result = ExperimentResult(
    experiment_name="rf_baseline",
    iteration=1,
    model_type="RandomForestClassifier",
    model_params={"n_estimators": 100, "max_depth": 10},
    preprocessing=PreprocessingConfig(),
    metrics={"accuracy": 0.87, "f1": 0.85},
    hypothesis="Random forest will handle non-linear patterns",
    reasoning="Data shows complex interactions",
    execution_time=45.3,
    success=True,
    code_path="experiments/rf_baseline.py"
)
experiment_name
str
Unique experiment identifier
iteration
int
Iteration number
model_type
str
sklearn model class name
model_params
dict[str, Any]
Model hyperparameters
preprocessing
PreprocessingConfig
Preprocessing configuration
metrics
dict[str, float]
Performance metrics
hypothesis
str
Hypothesis tested
reasoning
str
Design reasoning
execution_time
float
Wall clock time in seconds
success
bool
Whether experiment succeeded
error_message
Optional[str]
Error message if failed
code_path
Optional[str]
Path to generated script
timestamp
datetime
When experiment ran

Methods

get_primary_metric(metric_name: str) -> Optional[float]
rmse = result.get_primary_metric("rmse")
if rmse is not None:
    print(f"RMSE: {rmse:.4f}")

ExperimentSpec

Specification for designing an experiment.
from src.orchestration.state import ExperimentSpec, PreprocessingConfig

spec = ExperimentSpec(
    experiment_name="xgb_tuned",
    hypothesis="XGBoost with depth limit will reduce overfitting",
    model_type="XGBClassifier",
    model_params={
        "n_estimators": 200,
        "max_depth": 6,
        "learning_rate": 0.05
    },
    preprocessing=PreprocessingConfig(
        missing_values="median",
        scaling="standard",
        encoding="onehot"
    ),
    reasoning="Previous deep trees showed overfitting on validation set"
)
experiment_name
str
Unique name for this experiment
hypothesis
str
Testable hypothesis
model_type
str
sklearn/xgboost/lightgbm model class
model_params
dict[str, Any]
Hyperparameter dictionary
preprocessing
PreprocessingConfig
Data preprocessing settings
reasoning
str
Why this experiment was designed

PreprocessingConfig

Configuration for data preprocessing.
from src.orchestration.state import PreprocessingConfig

preproc = PreprocessingConfig(
    missing_values="median",
    scaling="standard",
    encoding="onehot",
    target_transform="log"
)
missing_values
str
Strategy: 'drop', 'mean', 'median', 'mode', 'constant'
scaling
str
Scaling: 'standard', 'minmax', 'none'
encoding
str
Categorical encoding: 'onehot', 'ordinal'
target_transform
Optional[str]
Target transformation: 'log', 'none', or None

DataProfile

Comprehensive dataset profile.
from src.orchestration.state import DataProfile

profile = DataProfile(
    n_rows=1000,
    n_columns=15,
    columns=["age", "income", "target"],
    column_types={"age": "int64", "income": "float64"},
    numeric_columns=["age", "income"],
    categorical_columns=["city", "category"],
    target_column="target",
    target_type="categorical",
    missing_values={"age": 10, "income": 5},
    missing_percentages={"age": 1.0, "income": 0.5},
    numeric_stats={...},
    categorical_stats={...},
    target_stats={...}
)
See DataProfiler API for detailed field descriptions.

Analysis Models

AnalysisResult

Analysis of experiment results.
from src.orchestration.state import AnalysisResult, MetricComparison, TrendPattern

analysis = AnalysisResult(
    experiment_name="xgb_tuned",
    iteration=5,
    success=True,
    primary_metric=MetricComparison(
        metric_name="rmse",
        current_value=4500.0,
        baseline_value=6000.0,
        best_value=4500.0,
        change_from_baseline_pct=-25.0,
        is_improvement=True,
        is_new_best=True
    ),
    trend_pattern=TrendPattern.IMPROVING,
    key_observations=[
        "New best RMSE achieved",
        "25% improvement over baseline",
        "Consistent improvement trend"
    ],
    reasoning="XGBoost depth limiting successfully reduced overfitting"
)
experiment_name
str
Experiment being analyzed
iteration
int
Iteration number
success
bool
Whether experiment succeeded
primary_metric
Optional[MetricComparison]
Detailed metric comparison
trend_pattern
TrendPattern
Detected trend: IMPROVING, DEGRADING, PLATEAU, FLUCTUATING, INITIAL
key_observations
list[str]
Important insights
reasoning
str
Detailed analysis reasoning

HypothesisSet

Set of hypotheses for next iteration.
from src.orchestration.state import HypothesisSet, Hypothesis

hypotheses = HypothesisSet(
    iteration=6,
    analysis_summary="XGBoost achieved best results, try further tuning",
    hypotheses=[
        Hypothesis(
            hypothesis_id="h1",
            statement="Lower learning rate will improve generalization",
            rationale="Current model may be learning too fast",
            suggested_model="XGBClassifier",
            suggested_params={"learning_rate": 0.01, "n_estimators": 500},
            confidence_score=0.85,
            priority=1
        ),
        Hypothesis(
            hypothesis_id="h2",
            statement="Feature engineering will help linear models",
            rationale="Linear models underperformed, may need feature interactions",
            suggested_model="LogisticRegression",
            confidence_score=0.60,
            priority=2
        )
    ],
    exploration_vs_exploitation="exploit",
    reasoning="Focus on tuning XGBoost since it's clearly best"
)

# Get top hypothesis
top = hypotheses.get_top_hypothesis()
print(top.statement)
iteration
int
Iteration these hypotheses are for
analysis_summary
str
Summary of what led to these hypotheses
hypotheses
list[Hypothesis]
List of testable hypotheses
exploration_vs_exploitation
str
Strategy: 'explore', 'exploit', 'balanced'
reasoning
str
Why these hypotheses were generated

Enums

TaskType

from src.orchestration.state import TaskType

TaskType.CLASSIFICATION  # "classification"
TaskType.REGRESSION      # "regression"

ExperimentPhase

from src.orchestration.state import ExperimentPhase

ExperimentPhase.INITIALIZING
ExperimentPhase.DATA_PROFILING
ExperimentPhase.BASELINE_MODELING
ExperimentPhase.EXPERIMENT_DESIGN
ExperimentPhase.CODE_GENERATION
ExperimentPhase.EXPERIMENT_EXECUTION
ExperimentPhase.RESULTS_ANALYSIS
ExperimentPhase.HYPOTHESIS_GENERATION
ExperimentPhase.REPORT_GENERATION
ExperimentPhase.COMPLETED
ExperimentPhase.FAILED

TrendPattern

from src.orchestration.state import TrendPattern

TrendPattern.IMPROVING     # Metrics getting better
TrendPattern.DEGRADING     # Metrics getting worse
TrendPattern.PLATEAU       # No significant change
TrendPattern.FLUCTUATING   # Unstable performance
TrendPattern.INITIAL       # Not enough data

Helper Functions

create_initial_state()

Create a new experiment state.
from src.orchestration.state import create_initial_state

state = create_initial_state(
    data_path="data.csv",
    target_column="target",
    task_type="classification",
    constraints="Use interpretable models",
    max_iterations=20,
    time_budget=3600,
    output_dir="outputs/"
)

Complete Example

from pathlib import Path
from src.orchestration.state import (
    create_initial_state,
    ExperimentState,
    ExperimentResult,
    PreprocessingConfig
)

# Create initial state
state = create_initial_state(
    data_path="housing.csv",
    target_column="price",
    task_type="regression",
    max_iterations=10,
    time_budget=1800
)

# Add baseline experiment
baseline = ExperimentResult(
    experiment_name="baseline",
    iteration=0,
    model_type="LinearRegression",
    metrics={"rmse": 6000, "r2": 0.65},
    success=True,
    execution_time=10.5
)
state.add_experiment(baseline)

print(f"Best RMSE: {state.best_metric}")
print(f"Best experiment: {state.best_experiment}")

# Check termination
should_stop, reason = state.should_terminate()
if not should_stop:
    # Continue with next iteration
    pass

# Save state
state.save(Path("state.json"))

# Later: load and resume
resumed_state = ExperimentState.load(Path("state.json"))
print(f"Resuming from iteration {resumed_state.current_iteration}")

Source Location

~/workspace/source/src/orchestration/state.py

Build docs developers (and LLMs) love