10 changes: 10 additions & 0 deletions src/heretic/config.py
@@ -215,6 +215,16 @@ class Settings(BaseSettings):
),
)

kl_tokens: int = Field(
default=1,
description=(
"Number of tokens to generate when computing KL divergence. "
"Higher values give a more robust quality signal at the cost of slower evaluation. "
"The KL divergence is averaged across all token positions. "
"Recommended: 1 (fastest, default), 3-5 (good tradeoff), >5 (diminishing returns)."
),
)
Comment on lines +218 to +226
Severity: medium

According to rule #8 of the repository style guide, when a new setting is added to config.py, it should also be added to config.default.toml. The new kl_tokens setting is missing from config.default.toml.

Please add the following to config.default.toml after winsorization_quantile:

# Number of tokens to generate when computing KL divergence.
# Higher values give a more robust quality signal at the cost of slower evaluation.
# The KL divergence is averaged across all token positions.
# Recommended: 1 (fastest, default), 3-5 (good tradeoff), >5 (diminishing returns).
kl_tokens = 1
References
  1. When new settings are added in config.py, they should also be added to config.default.toml, set to their default value and with their description as a comment. The order of settings in config.default.toml should match that in config.py. (link)


n_trials: int = Field(
default=200,
description="Number of abliteration trials to run during optimization.",
21 changes: 17 additions & 4 deletions src/heretic/evaluator.py
@@ -28,7 +28,12 @@ def __init__(self, settings: Settings, model: Model):
self.good_prompts = load_prompts(settings, settings.good_evaluation_prompts)
print(f"* [bold]{len(self.good_prompts)}[/] prompts loaded")

print("* Obtaining first-token probability distributions...")
kl_label = (
f"* Obtaining {settings.kl_tokens}-token probability distributions..."
if settings.kl_tokens > 1
else "* Obtaining first-token probability distributions..."
)
print(kl_label)
self.base_logprobs = model.get_logprobs_batched(self.good_prompts)

print()
@@ -93,7 +98,12 @@ def count_refusals(self) -> int:
return refusal_count

def get_score(self) -> tuple[tuple[float, float], float, int]:
print(" * Obtaining first-token probability distributions...")
kl_label = (
f" * Obtaining {self.settings.kl_tokens}-token probability distributions..."
if self.settings.kl_tokens > 1
else " * Obtaining first-token probability distributions..."
)
print(kl_label)

Severity: medium

The logic that builds the kl_label string duplicates the __init__ method (lines 31-36). To keep the two call sites from drifting apart, consider extracting it into a private helper method on the Evaluator class.
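
A minimal sketch of such a helper; the method name `_kl_label` and its `indent` parameter are illustrative, not part of the PR:

```python
def _kl_label(self, indent: str = "") -> str:
    # Build the progress message in one place so both call sites stay in sync.
    if self.settings.kl_tokens > 1:
        return f"{indent}* Obtaining {self.settings.kl_tokens}-token probability distributions..."
    return f"{indent}* Obtaining first-token probability distributions..."
```

Both call sites would then reduce to `print(self._kl_label())` in `__init__` and `print(self._kl_label(indent="  "))` in `get_score` (assuming `self.settings` is assigned before the first call).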

logprobs = self.model.get_logprobs_batched(self.good_prompts)
kl_divergence = F.kl_div(
logprobs,
@@ -107,8 +117,11 @@ def get_score(self) -> tuple[tuple[float, float], float, int]:
refusals = self.count_refusals()
print(f" * Refusals: [bold]{refusals}[/]/{len(self.bad_prompts)}")

kl_divergence_scale = self.settings.kl_divergence_scale
kl_divergence_target = self.settings.kl_divergence_target
# Scale thresholds by kl_tokens since multi-token KL produces
# proportionally larger absolute values.
kl_tokens = self.settings.kl_tokens
kl_divergence_scale = self.settings.kl_divergence_scale * kl_tokens
kl_divergence_target = self.settings.kl_divergence_target * kl_tokens

Severity: high

The KL divergence thresholds are scaled by kl_tokens. However, the implementation in model.py reshapes the logits so that F.kl_div with reduction="batchmean" computes the average KL divergence across all generated token positions, and an average is not proportional to the number of tokens. This scaling therefore appears incorrect and may cause the optimizer to behave in unintended ways; the thresholds should likely not be scaled.

As noted in model.py, batchmean already handles the averaging, so the resulting kl_divergence value should have a similar magnitude regardless of kl_tokens.

Suggested change
-        # Scale thresholds by kl_tokens since multi-token KL produces
-        # proportionally larger absolute values.
-        kl_tokens = self.settings.kl_tokens
-        kl_divergence_scale = self.settings.kl_divergence_scale * kl_tokens
-        kl_divergence_target = self.settings.kl_divergence_target * kl_tokens
+        kl_divergence_scale = self.settings.kl_divergence_scale
+        kl_divergence_target = self.settings.kl_divergence_target
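
A quick standalone check of that claim; the shapes below are made up for illustration:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
n_prompts, vocab = 4, 10

for n_tokens in (1, 3, 5):
    # Per-position log-probabilities for a base and a modified model,
    # flattened to (n_prompts * n_tokens, vocab) as in get_logprobs.
    base = F.log_softmax(torch.randn(n_prompts * n_tokens, vocab), dim=-1)
    other = F.log_softmax(torch.randn(n_prompts * n_tokens, vocab), dim=-1)

    # reduction="batchmean" divides the summed KL by the size of the first
    # dimension, so the result is a per-position average that does not
    # grow with n_tokens.
    kl = F.kl_div(other, base, reduction="batchmean", log_target=True)
    print(n_tokens, kl.item())
```

The printed values stay on the same order of magnitude as n_tokens grows, which is why the thresholds need no scaling.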


refusals_score = refusals / self.base_refusals

23 changes: 13 additions & 10 deletions src/heretic/model.py
@@ -656,25 +656,28 @@ def get_residuals_batched(self, prompts: list[Prompt]) -> Tensor:
# We work with logprobs rather than probabilities for numerical stability
# when computing the KL divergence.
def get_logprobs(self, prompts: list[Prompt]) -> Tensor:
# We only generate one token, and we return the (log) probability distributions
# over the vocabulary at that token position, for each prompt.
n_tokens = self.settings.kl_tokens

_, outputs = self.generate(
prompts,
max_new_tokens=1,
max_new_tokens=n_tokens,
output_scores=True,
return_dict_in_generate=True,
)

# This cast is valid because GenerateDecoderOnlyOutput is the return type
# of model.generate with return_dict_in_generate=True.
outputs = cast(GenerateDecoderOnlyOutput, outputs)
scores = cast(tuple[FloatTensor], outputs.scores)

if n_tokens == 1:
# Original single-token path: shape (prompt, vocab).
return F.log_softmax(scores[0], dim=-1)

# Logits for the first (only) generated token.
# This cast is valid because we passed output_scores=True above.
logits = cast(tuple[FloatTensor], outputs.scores)[0]
# Multi-token: stack all positions, reshape to (prompt * n_tokens, vocab)
# so KL div with batchmean naturally averages across all positions.

Severity: medium

This comment does not fully adhere to the style guide (rule #4), which requires comments to use correct grammar, start with a capital letter, and end with a period: the first line does not end with a period, and the second line is a continuation that starts with a lowercase letter. Please rephrase to follow the style guide.

Suggested change
-    # Multi-token: stack all positions, reshape to (prompt * n_tokens, vocab)
-    # so KL div with batchmean naturally averages across all positions.
+    # Multi-token: stack all positions and reshape to (prompt * n_tokens, vocab).
+    # This allows `F.kl_div` with `reduction="batchmean"` to average the KL divergence across all positions.
References
  1. Comments should start with a capital letter and end with a period. They should use correct grammar and spelling. (link)

all_logits = torch.stack(list(scores), dim=0) # (n_tokens, prompt, vocab)
all_logits = all_logits.permute(1, 0, 2).reshape(-1, all_logits.shape[-1])

# The returned tensor has shape (prompt, token).
return F.log_softmax(logits, dim=-1)
return F.log_softmax(all_logits, dim=-1)

def get_logprobs_batched(self, prompts: list[Prompt]) -> Tensor:
logprobs = []