The evaluator is a Python module that defines an evaluate() function. SkyDiscover calls this function with the path to each generated program and uses the returned metrics to guide evolution.
def evaluate(program_path: str) -> dict:
    """
    Evaluate a generated program and return metrics.

    This is a template: the returned values are placeholders that show the
    expected shape of the result, not real measurements.

    Args:
        program_path: Path to the program file (.py for code, .txt for prompts)

    Returns:
        Dictionary with at least 'combined_score' key
    """
    # 1. Load and execute the program
    # 2. Run tests or compute metrics
    # 3. Return scores
    return {
        "combined_score": 0.85,  # Required: higher is better
        "accuracy": 0.90,  # Optional: task-specific metrics
        "speed": 1.2,  # Optional: runtime in seconds
    }
The combined_score key is required and must be a float. Higher values indicate better solutions.
import numpy as npimport subprocessimport sysimport pickleimport tempfileimport osdef validate_packing(centers, radii): """Check that circles don't overlap and are inside unit square.""" n = centers.shape[0] # Check for NaN values if np.isnan(centers).any() or np.isnan(radii).any(): return False # Check if radii are nonnegative if (radii < 0).any(): return False # Check if circles are inside the unit square for i in range(n): x, y = centers[i] r = radii[i] if x - r < -1e-6 or x + r > 1 + 1e-6 or \ y - r < -1e-6 or y + r > 1 + 1e-6: return False # Check for overlaps for i in range(n): for j in range(i + 1, n): dist = np.sqrt(np.sum((centers[i] - centers[j]) ** 2)) if dist < radii[i] + radii[j] - 1e-6: return False return Truedef run_with_timeout(program_path, timeout_seconds=60): """Run program in subprocess with timeout.""" with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp_file: script = f"""import sysimport numpy as npimport picklesys.path.insert(0, os.path.dirname('{program_path}'))spec = __import__('importlib.util').util.spec_from_file_location( "program", '{program_path}')program = __import__('importlib.util').util.module_from_spec(spec)spec.loader.exec_module(program)centers, radii, sum_radii = program.run_packing()with open('{temp_file.name}.results', 'wb') as f: pickle.dump({{ 'centers': centers, 'radii': radii, 'sum_radii': sum_radii }}, f)""" temp_file.write(script.encode()) temp_file_path = temp_file.name results_path = f"{temp_file_path}.results" try: process = subprocess.Popen( [sys.executable, temp_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) stdout, stderr = process.communicate(timeout=timeout_seconds) if process.returncode != 0: raise RuntimeError(f"Process exited with code {process.returncode}") with open(results_path, 'rb') as f: results = pickle.load(f) return results['centers'], results['radii'], results['sum_radii'] finally: if os.path.exists(temp_file_path): os.unlink(temp_file_path) if 
os.path.exists(results_path): os.unlink(results_path)def evaluate(program_path): """Evaluate circle packing solution.""" TARGET_VALUE = 2.635 # Best known result try: centers, radii, reported_sum = run_with_timeout(program_path, timeout_seconds=60) # Validate solution valid = validate_packing(centers, radii) sum_radii = np.sum(radii) if valid else 0.0 # Calculate metrics target_ratio = sum_radii / TARGET_VALUE if valid else 0.0 combined_score = target_ratio # 1.0 = matches best known return { "combined_score": float(combined_score), "sum_radii": float(sum_radii), "target_ratio": float(target_ratio), "validity": 1.0 if valid else 0.0, } except Exception as e: print(f"Evaluation failed: {e}") return { "combined_score": 0.0, "validity": 0.0, "error": str(e) }
# ❌ Bad
def evaluate(program_path):
    """Anti-pattern: signal a failed program by raising an exception."""
    result = run_program(program_path)
    if result is None:
        # Raises instead of returning a metrics dictionary
        raise ValueError("Program failed")
    return {"combined_score": result}
Do return a zero score instead:
# ✅ Good
def evaluate(program_path):
    """Score a program, reporting any failure as a zero score.

    Args:
        program_path: Path to the generated program.

    Returns:
        Dictionary with a float 'combined_score'; on failure the score is 0.0
        and an 'error' message is included.
    """
    try:
        result = run_program(program_path)
        if result is None:
            return {"combined_score": 0.0, "error": "Program returned None"}
        # combined_score must be a float
        return {"combined_score": float(result)}
    except Exception as e:
        # Deliberately broad: any failure becomes a zero score, never a raise
        return {"combined_score": 0.0, "error": str(e)}
evaluator:
  llm_as_judge: true

prompt:
  evaluator_system_message: |
    You are a code quality judge. Evaluate the given code and return JSON:
    {
      "readability": 0.8,
      "correctness": 0.9,
      "efficiency": 0.7
    }
    Each score should be between 0.0 and 1.0.

llm:
  evaluator_models:
    - name: "gpt-4o-mini"
      weight: 1.0
The LLM judge appends llm_* metrics to your evaluator’s output:
import sys

# Make evaluator.py in the current directory importable
sys.path.insert(0, '.')

from evaluator import evaluate

# Test with your initial program
metrics = evaluate("initial_program.py")
print("Metrics:", metrics)

assert "combined_score" in metrics, "Missing combined_score!"
assert isinstance(metrics["combined_score"], float), "combined_score must be a float!"
assert 0.0 <= metrics["combined_score"] <= 1.0, "Score out of range!"
print("✓ Evaluator works!")