226 changes: 226 additions & 0 deletions examples/coding_env_inference.py
@@ -0,0 +1,226 @@
#!/usr/bin/env python3
"""Solve a coding task with a hosted LLM via Hugging Face Inference.

This script mirrors ``textarena_wordle_inference.py`` but targets the Coding
environment. It launches the CodingEnv Docker image locally and asks an
OpenAI-compatible model served through Hugging Face's router to iteratively
produce Python code until the task is solved.

Prerequisites
-------------
1. Build the Coding environment Docker image::

docker build \
-f src/envs/coding_env/server/Dockerfile \
-t coding-env:latest .

2. Export your Hugging Face token, or an API key for any other OpenAI-compatible provider::

export HF_TOKEN=your_token_here
export API_KEY=your_api_key_here

3. Run the script::

python examples/coding_env_inference.py

The script keeps feeding execution feedback to the model until the executed
script prints ``Result: 338350`` (the closed form ``n(n+1)(2n+1)/6`` with
``n = 100``) or the configured step limit is reached.
"""

from __future__ import annotations

import os
import re
from typing import List, Tuple

from openai import OpenAI

from envs.coding_env import CodeAction, CodingEnv


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

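# The Hugging Face router exposes an OpenAI-compatible endpoint, so the
# standard ``openai`` client can be pointed at it directly.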
API_BASE_URL = "https://router.huggingface.co/v1"
API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN")

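# Model ID in Hugging Face router format; the ``:novita`` suffix pins the
# request to a specific serving provider behind the router.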
MODEL = "openai/gpt-oss-120b:novita"
MAX_STEPS = 5
VERBOSE = True

CODING_TASK = (
"Write Python code that prints the sum of squares of the integers from 1 "
"to 100 inclusive. The final line must be exactly `Result: <value>` with "
"the correct number substituted."
)
EXPECTED_SUBSTRING = "Result: 338350"

SYSTEM_PROMPT = (
"You are an expert Python programmer. Respond with valid Python code that "
"solves the user's task. Always wrap your final answer in a fenced code "
"block starting with ```python. Provide a complete script that can be "
"executed as-is, with no commentary outside the code block."
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def extract_python_code(text: str) -> str:
"""Extract the first Python code block from the model output."""

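    # Grab the first fenced block; the ``python`` language tag is optional
    # and DOTALL lets the captured body span multiple lines.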
code_blocks = re.findall(
r"```(?:python)?\s*(.*?)```",
text,
re.IGNORECASE | re.DOTALL,
)
if code_blocks:
return code_blocks[0].strip()
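    # Fall back to the raw reply when no fenced block is present.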
return text.strip()


def format_feedback(
step: int,
stdout: str,
stderr: str,
exit_code: int,
) -> str:
"""Generate feedback text describing the previous execution."""

stdout_display = stdout if stdout.strip() else "<empty>"
stderr_display = stderr if stderr.strip() else "<empty>"
return (
f"Execution feedback for step {step}:\n"
f"exit_code={exit_code}\n"
f"stdout:\n{stdout_display}\n"
f"stderr:\n{stderr_display}\n"
"If the task is not solved, return an improved Python script."
)


def build_initial_prompt(task: str) -> str:
"""Construct the first user prompt for the coding task."""

return (
"You must write Python code to satisfy the following task. "
"When executed, your script should behave exactly as described.\n\n"
f"Task:\n{task}\n\n"
"Reply with the full script in a single ```python code block."
)


# ---------------------------------------------------------------------------
# Gameplay
# ---------------------------------------------------------------------------

def solve_coding_task(
env: CodingEnv,
client: OpenAI,
) -> Tuple[bool, List[str]]:
"""Iteratively ask the model for code until the task is solved."""

history = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": build_initial_prompt(CODING_TASK)},
]

    # Reset the environment; the initial observation is unused because the
    # first prompt is built from the task description alone.
    env.reset()

transcripts: List[str] = []

for step in range(1, MAX_STEPS + 1):
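        # Ask the model for a complete script; a low temperature keeps the
        # completions focused on working code.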
response = client.chat.completions.create(
model=MODEL,
messages=history,
max_tokens=2048,
temperature=0.2,
)

        assistant_message = (response.choices[0].message.content or "").strip()
history.append({"role": "assistant", "content": assistant_message})

code = extract_python_code(assistant_message)

if VERBOSE:
print(f"\n🛠️ Step {step}: executing model-produced code")
print(code)

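        # Execute the candidate script inside the CodingEnv container.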
result = env.step(CodeAction(code=code))
obs = result.observation

transcripts.append(
(
f"Step {step} | exit_code={obs.exit_code}\n"
f"stdout:\n{obs.stdout}\n"
f"stderr:\n{obs.stderr}\n"
)
)

if VERBOSE:
print(" ▶ exit_code:", obs.exit_code)
if obs.stdout:
print(" ▶ stdout:\n" + obs.stdout)
if obs.stderr:
print(" ▶ stderr:\n" + obs.stderr)

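        # Success requires a clean exit and the expected final line in stdout.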
solved = obs.exit_code == 0 and EXPECTED_SUBSTRING in obs.stdout
if solved:
return True, transcripts

history.append(
{
"role": "user",
"content": format_feedback(
step,
obs.stdout,
obs.stderr,
obs.exit_code,
),
}
)

# Keep conversation history compact to avoid exceeding context limits
if len(history) > 20:
history = [history[0]] + history[-19:]

return False, transcripts


# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------

def main() -> None:
if not API_KEY:
raise SystemExit(
"HF_TOKEN (or API_KEY) must be set to query the model."
)

client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

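    # Launch the CodingEnv container locally; port 8000 is forwarded so the
    # client can reach the environment server.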
env = CodingEnv.from_docker_image(
"coding-env:latest",
ports={8000: 8000},
)

try:
success, transcripts = solve_coding_task(env, client)
finally:
env.close()

print(
"\n✅ Session complete"
if success
else "\n⚠️ Session finished without solving the task"
)
print("--- Execution transcripts ---")
for entry in transcripts:
print(entry)


if __name__ == "__main__":
main()