Commit 7c139d6

add multi-turn RL support
1 parent 81155e3 commit 7c139d6

13 files changed: 1712 additions & 250 deletions

README.md

Lines changed: 42 additions & 1 deletion
@@ -31,6 +31,47 @@ We then extend `KernelBenchEnv` to support:
- **Batching**: `KernelBenchEnvGroupBuilder` groups multiple rollouts for the same problem, enabling **GRPO-style** training where rewards are normalized within groups.
- **Dataset Construction**: `KernelBenchDatasetBuilder` handles the iteration over KernelBench levels and problems, partitioning them into training and evaluation sets. You are welcome to extend it to support more problems beyond what is currently in KernelBench.

### Multi-Turn RL
We extend the single-turn pipeline with multi-turn iterative refinement, following the approach in [Kevin](https://arxiv.org/abs/2507.11948). Instead of generating one kernel per problem, the model generates a kernel, receives evaluation feedback (compilation errors, correctness failures, or speedup results), and refines its solution over multiple turns.
`MultiTurnKernelBenchEnv` manages the multi-turn loop:

- **History management**: Prior turns (prompt, response, feedback) are kept in context, with token-based truncation to stay within the context window.
- **Evaluation feedback**: Structured feedback tells the model what went wrong (compilation error, incorrect output, or correct but slow) so it can fix specific issues.
- **Early stopping**: Optionally stop the episode when the kernel passes all correctness tests.
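The refinement loop can be sketched roughly as follows (an illustrative simplification, not the actual `MultiTurnKernelBenchEnv` code; `generate` and `evaluate` are stand-ins for the sampler and the KernelBench evaluation harness):

```python
def run_episode(problem_prompt, generate, evaluate, max_turns=4, early_stop=False):
    """Sketch of one multi-turn trajectory: generate, evaluate, feed back, refine."""
    history = [problem_prompt]          # turn 1 uses the full problem prompt
    turns = []
    for _ in range(max_turns):
        response = generate("\n".join(history))
        result = evaluate(response)     # dict: compiled, correct, speedup, error
        turns.append((response, result))
        if early_stop and result["correct"]:
            break
        # Structured feedback so the next turn can fix the specific failure
        if not result["compiled"]:
            feedback = f"Compilation failed:\n{result['error']}"
        elif not result["correct"]:
            feedback = "Kernel output is incorrect; fix correctness first."
        else:
            feedback = f"Correct, speedup {result['speedup']:.2f}x; optimize further."
        history += [response, feedback]
    return turns
```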
Training uses GRPO with discounted returns across turns:

- Per-turn scores are computed as `S = 0.3 * correct + speedup`, with the speedup term applied only to correct kernels.
- Discounted returns: `R_t = S_t + γ * R_{t+1}` (backward recursion, γ = 0.4 by default).
- Advantages are normalized across all `group_size × max_turns` turn-level samples: `(R - mean) / (std + ε)`.
- PPO uses asymmetric clipping (Clip-Higher, ε_low = 0.2, ε_high = 0.28) and constant length normalization.
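The score, return, and advantage computations above can be sketched in plain Python (an illustration under the listed defaults, not the training code itself):

```python
from statistics import pstdev

def turn_scores(results, correctness_weight=0.3):
    # S = 0.3 * correct + speedup; the speedup term counts only for correct kernels
    return [correctness_weight * r["correct"] + (r["speedup"] if r["correct"] else 0.0)
            for r in results]

def discounted_returns(scores, gamma=0.4):
    # Backward recursion: R_t = S_t + gamma * R_{t+1}
    returns, running = [], 0.0
    for s in reversed(scores):
        running = s + gamma * running
        returns.append(running)
    return returns[::-1]

def normalized_advantages(group_returns, eps=1e-6):
    # Flatten all group_size x max_turns turn-level returns, then (R - mean) / (std + eps)
    flat = [r for returns in group_returns for r in returns]
    mean = sum(flat) / len(flat)
    return [(r - mean) / (pstdev(flat) + eps) for r in flat]
```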
Enable multi-turn via config:
```yaml
multiturn:
  enabled: true
  max_turns: 4        # Refinement turns per trajectory
  gamma: 0.4          # Discount factor
  aggregation: "sum"  # "sum" or "max"
```

Or via CLI:
```bash
uv run python -m kernelbench_tinker.scripts.train_kernel_rl \
    --config src/kernelbench_tinker/config/rl_kernelbench.yaml \
    multiturn.enabled=true \
    log_path=./runs/my_multiturn_experiment
```

Multi-turn inference is also supported via the eval script:
```bash
uv run python -m kernelbench_tinker.scripts.eval_kernel_rl \
    checkpoint_path=<your_checkpoint> \
    multiturn_enabled=true \
    multiturn_max_turns=8 \
    level=1
```

### Directory Structure
```text
@@ -54,6 +95,7 @@ src/kernelbench_tinker/
  envs/
    kernelbench_client.py         # KernelBench Python API wrapper
    kernelbench_env.py            # Single-turn RL environment
    multiturn_kernelbench_env.py  # Multi-turn RL environment
  training/
    models.py                     # Model/renderer configuration
    reward.py                     # Reward shaping
@@ -282,7 +324,6 @@ Note the scope of this repo is an open-source implementation of KernelBench-Tinker
 * More reward examples leveraging more fine-grained metrics
 * More reward hack checking
-* Multi-turn RL to have denser reward signal like [Kevin](https://arxiv.org/abs/2507.11948)
 * Improve Step time and training efficiency


src/kernelbench_tinker/config/configs.py

Lines changed: 54 additions & 0 deletions
@@ -81,3 +81,57 @@ class DatasetConfig:

    # Train/test split
    test_fraction: float = 0.1

@dataclass
class MultiTurnConfig:
    """
    Configuration for multi-turn RL training.

    Controls the iterative refinement loop where the model receives
    evaluation feedback and can fix errors across multiple turns.
    """

    # Enable multi-turn mode (False = single-turn)
    enabled: bool = False

    # Maximum refinement turns per trajectory
    max_turns: int = 4

    # Discount factor for multi-turn returns: R_t = S_t + gamma * R_{t+1}
    gamma: float = 0.4

    # Return aggregation mode: "sum" or "max"
    #   sum: R_t = Σ γ^(i-t) × S_i    (reward turns leading to many good kernels)
    #   max: R_t = max_i γ^(i-t) × S_i (reward turns leading to one great kernel)
    aggregation: str = "sum"

    # Stop the episode early when the kernel is correct.
    # Default False for training: the model needs post-correctness turns to
    # learn speedup optimization. Set True at eval time if desired.
    early_stop_on_correct: bool = False

    # Optional: require this speedup before early stopping
    speedup_threshold: float | None = None

    # Prompt
    prompt_max_tokens: int | None = None  # Token budget for history truncation (None = char fallback)
    inject_think_token: bool = False      # Append <think>\n to generation prompts

    # Generation
    temperature: float = 0.9
    top_p: float = 1.0
    seed: int | None = None

    # Response length extension mid-training (0 = disabled)
    max_tokens_extended: int = 22000
    max_tokens_extend_after_step: int = 30

    # Training
    loss_fn: str = "ppo"
    max_grad_norm: float = 0.05
    warmup_ratio: float = 0.03
    clip_epsilon_low: float = 0.2
    clip_epsilon_high: float = 0.28
    constant_length_norm: int = 16384
    num_substeps: int = 2
src/kernelbench_tinker/config/rl_kernelbench.yaml

Lines changed: 31 additions & 0 deletions
@@ -26,6 +26,33 @@ learning_rate: 0.000002  # 2e-6 as explicit float
max_tokens: 16384
temperature: 1.0

# =============================================================================
# Multi-turn Configuration (disabled by default)
# =============================================================================
multiturn:
  enabled: false                    # true to enable iterative refinement
  max_turns: 4                      # Maximum refinement turns per trajectory
  gamma: 0.4                        # Discount factor for multi-turn returns
  aggregation: "sum"                # "sum" (reward many good kernels) or "max" (reward one great kernel)
  early_stop_on_correct: false      # Stop episode when kernel passes all tests
  speedup_threshold: null           # Required speedup before early stopping (null = any correct)
  # Prompt
  prompt_max_tokens: null           # Token budget for history truncation (null = char fallback)
  inject_think_token: false         # Append <think>\n to generation prompts
  # Generation
  temperature: 0.9                  # Generation temperature
  top_p: 1.0                        # Nucleus sampling (1.0 = disabled)
  seed: null                        # Random seed for generation (null = random)
  max_tokens_extended: 22000        # Extend max_tokens mid-training (0 = disabled)
  max_tokens_extend_after_step: 30  # Step at which to switch
  # Training
  loss_fn: "ppo"                    # Loss function (single-turn uses top-level loss_fn)
  max_grad_norm: 0.05               # Gradient clipping (0.0 = disabled)
  warmup_ratio: 0.03                # Linear LR warmup fraction
  clip_epsilon_low: 0.2             # PPO clip lower bound
  clip_epsilon_high: 0.28           # PPO clip upper bound (Clip-Higher)
  constant_length_norm: 16384       # GRPO constant length normalization (0 = disabled)

# =============================================================================
# Training Configuration
# =============================================================================
@@ -57,6 +84,7 @@ dataset_builder:
  # Problem Selection
  # ---------------------------------------------------------------------------
  level: 1                    # KernelBench level (1, 2, 3, or 4)
  levels: null                # Train on multiple levels (e.g. [1, 2]); overrides level when set
  start_problem: null         # First problem ID (null = start from 1)
  end_problem: null           # Last problem ID (null = all problems)
  dataset_src: "huggingface"  # "huggingface" or "local"
@@ -107,6 +135,9 @@ dataset_builder:
  reward_correctness_weight: 0.3
  reward_speed_weight: 1.0
  reward_length_weight: 0.0
  reward_speed_max_reward: 10.0  # Cap on speed reward component (set high to uncap)
  reward_clip_min: null          # Lower bound on total reward (null = no clipping)
  reward_clip_max: null          # Upper bound on total reward (null = no clipping)
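Putting the reward fields above together, the shaping might look like the following sketch (illustrative only; the actual logic lives in `reward.py`, and the length term is omitted since `reward_length_weight` defaults to 0):

```python
def shaped_reward(correct, speedup,
                  correctness_weight=0.3, speed_weight=1.0,
                  speed_max_reward=10.0, clip_min=None, clip_max=None):
    # Correctness bonus plus a capped speed term (speed counts only when
    # the kernel is correct), with optional clipping of the total reward.
    reward = correctness_weight * float(correct)
    if correct:
        reward += speed_weight * min(speedup, speed_max_reward)
    if clip_min is not None:
        reward = max(reward, clip_min)
    if clip_max is not None:
        reward = min(reward, clip_max)
    return reward
```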

  # ---------------------------------------------------------------------------
  # Reward Hacking Detection (Static Checker)

src/kernelbench_tinker/envs/kernelbench_client.py

Lines changed: 35 additions & 0 deletions
@@ -33,11 +33,18 @@
    re.DOTALL | re.IGNORECASE
)

# Summary block pattern - reasoning summary inside <SUMMARY>...</SUMMARY>
SUMMARY_BLOCK_PATTERN = re.compile(
    r"<SUMMARY>(.*?)</SUMMARY>",
    re.DOTALL | re.IGNORECASE
)

@dataclass
class ParsedResponse:
    """Parsed model response with kernel blocks."""
    kernel: str       # Kernel code (from <KERNEL> block or extracted code block)
    cot_summary: str  # Reasoning summary (from <SUMMARY> block)
    raw: str          # Original raw response
    format_ok: bool   # Whether we successfully extracted kernel code

@@ -94,8 +101,15 @@ def parse_structured_response(text: str) -> ParsedResponse:
    # Check if we got valid kernel code
    format_ok = bool(kernel) and ("class ModelNew" in kernel or "def forward" in kernel)

    # Extract CoT summary from <SUMMARY> block
    cot_summary = ""
    summary_match = SUMMARY_BLOCK_PATTERN.search(text)
    if summary_match:
        cot_summary = summary_match.group(1).strip()

    return ParsedResponse(
        kernel=kernel,
        cot_summary=cot_summary,
        raw=raw,
        format_ok=format_ok,
    )
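The `<SUMMARY>` extraction above can be exercised in isolation (a minimal standalone reproduction of the pattern, not an import from this package):

```python
import re

# Same pattern as above: non-greedy body, DOTALL spans newlines,
# IGNORECASE accepts lowercase tags.
SUMMARY_BLOCK_PATTERN = re.compile(r"<SUMMARY>(.*?)</SUMMARY>", re.DOTALL | re.IGNORECASE)

def extract_summary(text: str) -> str:
    match = SUMMARY_BLOCK_PATTERN.search(text)
    return match.group(1).strip() if match else ""
```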
@@ -487,6 +501,7 @@ class KernelBenchProblem:
    prompt_gpu_name: str | None = None

    _prompt: str | None = field(default=None, repr=False)
    _base_prompt: str | None = field(default=None, repr=False)

    @property
    def prompt(self) -> str:
@@ -504,3 +519,23 @@ def prompt(self) -> str:
            )
        return self._prompt

    @property
    def base_prompt(self) -> str:
        """Get the zero-shot prompt (no examples) for refinement turns.

        In multi-turn training, the one-shot example is included only on the
        first turn. Subsequent turns use this stripped-down prompt to save
        context tokens.
        """
        if self._base_prompt is None:
            self._base_prompt = get_prompt_for_problem(
                self.level,
                self.problem_id,
                self.backend,
                option="zero_shot",
                dataset_src=self.dataset_src,
                precision=self.prompt_precision,
                include_hardware=self.prompt_include_hardware,
                gpu_name=self.prompt_gpu_name,
            )
        return self._base_prompt
