From 6e5bdda18081b7ca6478d65635955c0c4d3a45e0 Mon Sep 17 00:00:00 2001
From: bb-connor <bb-connor@users.noreply.github.com>
Date: Fri, 8 May 2026 22:13:50 -0400
Subject: [PATCH 1/2] Phase 1.4 (c): GraphitiHttpRouter + kb-engine optional
 deps + compose image fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(c) GraphitiHttpRouter — production Graphiti writer.

  kb_engine/sync.py: new GraphitiHttpRouter class. POSTs records with
  target="graphiti" to a graphiti-mcp HTTP endpoint as MCP JSON-RPC
  `tools/call` envelopes (default tool: `add_memory`). Records with
  other targets are skipped. Failures are captured in router.failures
  by default; --strict raises. The HTTP `post` callable is dependency-
  injected so tests use a fake response without httpx installed.

  Per AGENTS.md hard rule #1: this is the **designated** Graphiti writer.
  Plugin code never POSTs Graphiti directly — handlers produce
  DerivedRecord(target="graphiti", ...), the daemon's router list
  includes a GraphitiHttpRouter (alongside JsonlRouter + NullRouter as
  needed), and only the daemon's router runs the HTTP write.

  +7 tests covering: posts an episode (verifies envelope shape), skips
  non-graphiti records, handles HTTP errors non-strict (records to
  failures), strict mode raises, network exception non-strict, falls
  back to frontmatter for episode_body when no explicit body provided,
  increments JSON-RPC id per call. kb-engine total: 49 tests.

(infra) Optional-dependency extras for kb-engine:
    [project.optional-dependencies]
    postgres = ["psycopg[binary]>=3.1", "pgvector>=0.2"]
    neo4j    = ["neo4j>=5"]
    openai   = ["openai>=1.0"]
    watch    = ["watchdog>=4"]
    yaml     = ["pyyaml>=6.0"]
    http     = ["httpx>=0.27"]
    all      = [<every backend>]
  chio-pack now declares `kb-engine[all]` so `uv sync` brings every
  store driver in. The lazy-import pattern in store/embed.py /
  store/postgres.py / store/neo4j.py / sync.py still works without
  the extras for environments that don't need a particular backend.

(infra) docker-compose.yml fix:
  graphiti-mcp image was guessed `zepai/graphiti-mcp:latest` (doesn't
  exist on Docker Hub). Verified via `docker ps` against the running
  PR #599 instance: actual image is
  `zepai/knowledge-graph-mcp:standalone`. Updated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 chio-pack/pyproject.toml     |   2 +-
 infra/docker-compose.yml     |   6 +-
 kb-engine/kb_engine/sync.py  |  90 ++++++++++++++++++++++++++
 kb-engine/pyproject.toml     |  25 +++++++-
 kb-engine/tests/test_sync.py | 119 +++++++++++++++++++++++++++++++++++
 5 files changed, 238 insertions(+), 4 deletions(-)
diff --git a/chio-pack/pyproject.toml b/chio-pack/pyproject.toml
index 530ae0d..b9a7afb 100644
--- a/chio-pack/pyproject.toml
+++ b/chio-pack/pyproject.toml
@@ -6,7 +6,7 @@ requires-python = ">=3.11"
 authors = [{ name = "Backbay" }]
 dependencies = [
     "pyyaml>=6.0",
-    "kb-engine",
+    "kb-engine[all]",
     "click>=8.1",
 ]
 
diff --git a/infra/docker-compose.yml b/infra/docker-compose.yml
index d6d7b51..d4d3f8e 100644
--- a/infra/docker-compose.yml
+++ b/infra/docker-compose.yml
@@ -53,7 +53,11 @@ services:
       retries: 12
 
   graphiti-mcp:
-    image: zepai/graphiti-mcp:latest
+    # Real image used by arc PR #599's stack. The earlier docker-compose
+    # in this repo guessed `zepai/graphiti-mcp:latest` which doesn't
+    # exist on Docker Hub. Verified via `docker ps` against the running
+    # PR-599 instance (commit 4c08f93).
+    image: zepai/knowledge-graph-mcp:standalone
     container_name: chio-kb-graphiti
     depends_on:
       kb-neo4j:
diff --git a/kb-engine/kb_engine/sync.py b/kb-engine/kb_engine/sync.py
index ce1808a..6de7e05 100644
--- a/kb-engine/kb_engine/sync.py
+++ b/kb-engine/kb_engine/sync.py
@@ -88,6 +88,96 @@ def write(self, records: Iterable[DerivedRecord]) -> int:
         return n
 
 
+class GraphitiHttpRouter:
+    """POSTs Graphiti-target records to a graphiti-mcp HTTP endpoint.
+
+    Per AGENTS.md hard rule #1, this is the designated Graphiti writer.
+    Plugin code never writes Graphiti directly — it produces
+    DerivedRecord(target="graphiti", ...) objects, which the daemon
+    routes through this class.
+
+    Records with target != "graphiti" are skipped. Each Graphiti record
+    becomes one MCP JSON-RPC `tools/call` envelope (default tool:
+    `add_memory`). Envelope-build errors, transport errors, and HTTP
+    status >= 400 are appended to `failures`; strict=True also raises.
+    Constructor takes the MCP URL plus optional overrides (tool_name,
+    timeout_seconds, strict); `post` is injectable so tests need no httpx.
+    """
+
+    DEFAULT_TOOL = "add_memory"
+
+    def __init__(
+        self,
+        url: str,
+        *,
+        tool_name: str = DEFAULT_TOOL,
+        timeout_seconds: float = 30.0,
+        strict: bool = False,
+        post: Any = None,
+    ) -> None:
+        self.url = url
+        self.tool_name = tool_name
+        self.timeout_seconds = timeout_seconds
+        self.strict = strict
+        self._post = post
+        self._next_id = 1
+        self.failures: list[tuple[DerivedRecord, str]] = []
+
+    def _default_post(self, url: str, json: dict[str, Any]) -> Any:
+        try:
+            import httpx  # type: ignore
+        except ImportError as e:
+            raise RuntimeError(
+                "httpx not installed. `pip install kb-engine[http]` "
+                "or pass a custom `post` callable."
+            ) from e
+        return httpx.post(url, json=json, timeout=self.timeout_seconds)
+
+    def _build_envelope(self, record: DerivedRecord) -> dict[str, Any]:
+        rpc_id = self._next_id
+        self._next_id += 1
+        payload = record.payload
+        arguments: dict[str, Any] = {
+            "name": payload.get("name", ""),
+            "episode_body": payload.get("episode_body")
+                or json.dumps(payload.get("frontmatter", payload), sort_keys=True),
+            "source_description": payload.get("source_description", ""),
+            "source": payload.get("source", "json"),
+        }
+        if "group_id" in payload:
+            arguments["group_id"] = payload["group_id"]
+        return {
+            "jsonrpc": "2.0",
+            "id": rpc_id,
+            "method": "tools/call",
+            "params": {"name": self.tool_name, "arguments": arguments},
+        }
+
+    def write(self, records: Iterable[DerivedRecord]) -> int:
+        post = self._post or self._default_post
+        n = 0
+        for r in records:
+            if r.target != "graphiti":
+                continue
+            try:  # envelope build included: json.dumps may reject a payload
+                envelope = self._build_envelope(r)
+                response = post(self.url, json=envelope)
+            except Exception as e:
+                self.failures.append((r, str(e)))
+                if self.strict:
+                    raise
+                continue
+            status = getattr(response, "status_code", None)
+            if status is not None and status >= 400:
+                msg = f"HTTP {status}: {getattr(response, 'text', '')[:200]}"
+                self.failures.append((r, msg))
+                if self.strict:
+                    raise RuntimeError(f"Graphiti POST failed: {msg}")
+                continue
+            n += 1  # NOTE(review): only the HTTP status is checked, not the JSON-RPC body
+        return n
+
+
 # === Frontmatter parsing ===
 
 
diff --git a/kb-engine/pyproject.toml b/kb-engine/pyproject.toml
index 88b591a..cdb8b98 100644
--- a/kb-engine/pyproject.toml
+++ b/kb-engine/pyproject.toml
@@ -1,10 +1,31 @@
 [project]
 name = "kb-engine"
-version = "0.1.0"
+version = "0.2.0"
 description = "Generic retrieval / graph / MCP framework for chio-developer-base. Domain-agnostic."
 requires-python = ">=3.11"
 authors = [{ name = "Backbay" }]
-dependencies = []
+dependencies = [
+    # Core deps are stdlib-only. Backing-store drivers below are optional
+    # extras so the engine remains usable in environments that don't need
+    # them (tests, offline dev). Production installs use kb-engine[all].
+]
+
+[project.optional-dependencies]
+postgres = ["psycopg[binary]>=3.1", "pgvector>=0.2"]
+neo4j = ["neo4j>=5"]
+openai = ["openai>=1.0"]
+watch = ["watchdog>=4"]
+yaml = ["pyyaml>=6.0"]
+http = ["httpx>=0.27"]
+all = [
+    "psycopg[binary]>=3.1",
+    "pgvector>=0.2",
+    "neo4j>=5",
+    "openai>=1.0",
+    "watchdog>=4",
+    "pyyaml>=6.0",
+    "httpx>=0.27",
+]
 
 [dependency-groups]
 dev = [
diff --git a/kb-engine/tests/test_sync.py b/kb-engine/tests/test_sync.py
index 6270fdf..70d7569 100644
--- a/kb-engine/tests/test_sync.py
+++ b/kb-engine/tests/test_sync.py
@@ -7,6 +7,7 @@
 
 from kb_engine import DerivedRecord, Registry
 from kb_engine.sync import (
+    GraphitiHttpRouter,
     JsonlRouter,
     NullRouter,
     SyncState,
@@ -228,3 +229,121 @@ def test_content_hash_changes_with_content():
     b = _content_hash("hello world")
     assert a != b
     assert a.startswith("sha256:")
+
+
+# === GraphitiHttpRouter ===
+
+
+class _FakeResponse:
+    def __init__(self, status_code: int = 200, text: str = '{"result": {}}'):
+        self.status_code = status_code
+        self.text = text
+
+
+def test_graphiti_router_posts_episode():
+    sent: list = []
+
+    def fake_post(url, json):
+        sent.append((url, json))
+        return _FakeResponse(200)
+
+    router = GraphitiHttpRouter("http://localhost:8000/mcp", post=fake_post)
+    n = router.write([
+        DerivedRecord(target="graphiti", payload={
+            "name": "Test episode",
+            "source_description": "Test seed",
+            "frontmatter": {"id": "ep.test"},
+        })
+    ])
+    assert n == 1
+    url, envelope = sent[0]
+    assert url == "http://localhost:8000/mcp"
+    assert envelope["jsonrpc"] == "2.0"
+    assert envelope["method"] == "tools/call"
+    assert envelope["params"]["name"] == "add_memory"
+    assert envelope["params"]["arguments"]["name"] == "Test episode"
+
+
+def test_graphiti_router_skips_non_graphiti_records():
+    sent: list = []
+
+    def fake_post(url, json):
+        sent.append((url, json))
+        return _FakeResponse(200)
+
+    router = GraphitiHttpRouter("http://localhost:8000/mcp", post=fake_post)
+    records = [
+        DerivedRecord(target="neo4j", payload={"id": "x"}),
+        DerivedRecord(target="graphiti", payload={"name": "Y"}),
+        DerivedRecord(target="audit-log", payload={}),
+    ]
+    n = router.write(records)
+    assert n == 1
+    assert len(sent) == 1
+
+
+def test_graphiti_router_handles_http_errors_non_strict():
+    def fake_post(url, json):
+        return _FakeResponse(500, text="server error")
+
+    router = GraphitiHttpRouter("http://localhost:8000/mcp", post=fake_post)
+    n = router.write([DerivedRecord(target="graphiti", payload={"name": "X"})])
+    assert n == 0
+    assert len(router.failures) == 1
+    assert "HTTP 500" in router.failures[0][1]
+
+
+def test_graphiti_router_strict_raises_on_http_error():
+    def fake_post(url, json):
+        return _FakeResponse(500, text="boom")
+
+    router = GraphitiHttpRouter(
+        "http://localhost:8000/mcp", post=fake_post, strict=True
+    )
+    with pytest.raises(RuntimeError, match="Graphiti POST failed"):
+        router.write([DerivedRecord(target="graphiti", payload={"name": "X"})])
+
+
+def test_graphiti_router_handles_post_exception_non_strict():
+    def fake_post(url, json):
+        raise ConnectionError("network down")
+
+    router = GraphitiHttpRouter("http://localhost:8000/mcp", post=fake_post)
+    n = router.write([DerivedRecord(target="graphiti", payload={"name": "X"})])
+    assert n == 0
+    assert "network down" in router.failures[0][1]
+
+
+def test_graphiti_router_falls_back_to_frontmatter_for_episode_body():
+    sent: list = []
+
+    def fake_post(url, json):
+        sent.append((url, json))
+        return _FakeResponse(200)
+
+    router = GraphitiHttpRouter("http://localhost:8000/mcp", post=fake_post)
+    router.write([
+        DerivedRecord(target="graphiti", payload={
+            "name": "Y",
+            "frontmatter": {"id": "y", "type": "episode-architecture-summary"},
+        })
+    ])
+    body = sent[0][1]["params"]["arguments"]["episode_body"]
+    assert "episode-architecture-summary" in body
+
+
+def test_graphiti_router_increments_jsonrpc_id_per_call():
+    sent: list = []
+
+    def fake_post(url, json):
+        sent.append(json)
+        return _FakeResponse(200)
+
+    router = GraphitiHttpRouter("http://localhost:8000/mcp", post=fake_post)
+    router.write([
+        DerivedRecord(target="graphiti", payload={"name": "A"}),
+        DerivedRecord(target="graphiti", payload={"name": "B"}),
+    ])
+    ids = [e["id"] for e in sent]
+    assert len(ids) == 2
+    assert ids[0] != ids[1]

From 0a94ef8b22c74dc99c47881d3787faca5c63b22f Mon Sep 17 00:00:00 2001
From: bb-connor <bb-connor@users.noreply.github.com>
Date: Fri, 8 May 2026 22:19:15 -0400
Subject: [PATCH 2/2] Phase 0 (b): Run-0 calibration harness for
 cap-error-explanation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(b) Calibration harness skeleton per ADR-0004. Three rater abstractions:

  - HumanRater: interactive stdin prompt, prints scenario + augmentation
    body to stdout, reads four 1–5 scores. rater-A is the human (@connor).
  - AnthropicRater: lazy-imports the anthropic SDK, prompts a model
    with one of two system rubrics, parses a JSON object
    {clarity, accuracy, actionability, brevity}. rater-B and rater-C
    use this with claude-sonnet-4-6 (canonical) and
    claude-haiku-4-5-20251001 (accuracy-emphasis variant).
  - DeterministicRater: returns configured fake scores; used by tests
    and by `--dry-run` so the harness can be exercised without an
    API key or human in the loop.

  Two system rubrics live in this module:
  CANONICAL_RUBRIC_SYSTEM (rater-A and rater-B) and
  ACCURACY_EMPHASIS_SYSTEM (rater-C, deliberately diverged per
  ADR-0004 to surface accuracy/brevity trade-offs at calibration time).

  Loads scenarios from chio-pack/eval/fixtures/cap-error-explanation/
  via PyYAML; picks the augmentation by name from the scenario's
  `augmentations_under_test:` list. `calibrate(scenario_path, pool, ...)`
  returns a CalibrationRun (dataclass) with one RaterScore per rater plus a
  `disagreement_flags()` helper that returns dimensions where
  max - min > 1 (strict; a diff of 1 does NOT flag — verified by test).

  `render_calibration_md(run)` produces the 12-row table
  (3 raters × 4 dimensions) the calibration ADR template expects.

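  CLI: `python -m chio_pack.eval.calibration --dry-run --run-number 0`
  rates one scenario (`--scenario`, defaulting to
  revoked-cap-still-presented.yml) with deterministic raters and prints
  JSON; covering all 10 scenarios means one invocation per scenario.
  Real runs omit --dry-run and select --augmentation raw|enriched|baseline.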

  +8 tests in tests/test_calibration.py covering pool size, scenario
  loading, unknown-augmentation raises, disagreement flag detection
  (>1 strict, =1 not flagged), render produces 12 rows, RaterScore.mean(),
  DeterministicRater determinism. chio-pack total: 39 → 47 tests.

This is the harness only. Running Run-0 against the full 10 scenarios
with three real raters is gated on ANTHROPIC_API_KEY (rater-B,
rater-C) plus a sit-down with @connor (rater-A); both are out of band
for this commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 chio-pack/chio_pack/eval/calibration.py | 391 ++++++++++++++++++++++++
 chio-pack/tests/test_calibration.py     | 102 +++++++
 2 files changed, 493 insertions(+)
 create mode 100644 chio-pack/chio_pack/eval/calibration.py
 create mode 100644 chio-pack/tests/test_calibration.py

diff --git a/chio-pack/chio_pack/eval/calibration.py b/chio-pack/chio_pack/eval/calibration.py
new file mode 100644
index 0000000..b4d5396
--- /dev/null
+++ b/chio-pack/chio_pack/eval/calibration.py
@@ -0,0 +1,391 @@
+"""Run-0 calibration harness for the cap-error-explanation eval (Eval 4).
+
+Per RATERS.md "Calibration cadence": one shared scenario rated by all
+three raters; scores discussed; full scoring resumes. Run-0 is the
+inaugural calibration.
+
+This module:
+  - Loads a cap-error scenario YAML.
+  - For each of 3 raters (A / B / C per ADR-0004), prompts the rater
+    to score the four rubric dimensions (clarity, accuracy,
+    actionability, brevity) on the `raw` augmentation.
+  - Captures pre-discussion scores. Post-discussion scores are filled
+    in manually afterwards; this CLI has no `--phase` option yet.
+  - With `--report PATH`, appends rows in the canonical history table
+    format to that file (e.g. `vault/_meta/dashboards/rater-calibration.md`).
+
+Three rater types, all wrapped in a single `Rater` Protocol:
+  - HumanRater       interactive (stdin) — for rater-A
+  - AnthropicRater   Anthropic SDK call with rubric system prompt
+  - DeterministicRater  fake; for tests + dry-runs without API access
+
+The Anthropic rater is the production form for rater-B (Sonnet 4.6,
+canonical rubric) and rater-C (Haiku 4.5, accuracy-emphasis rubric).
+
+Run-0 is the bar for ADR-0002 sign-off — without it, the inter-rater
+calibration row in RATERS.md stays TBD and Phase 1 cannot start.
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import json
+import os
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Callable, Protocol
+
+try:
+    import yaml
+except ImportError:
+    print("error: PyYAML required", file=sys.stderr)
+    raise
+
+
+DIMENSIONS = ("clarity", "accuracy", "actionability", "brevity")
+
+
+# === Rubric source ===
+
+
+CANONICAL_RUBRIC_SYSTEM = """\
+You are a Chio capability-error rater. Score the explanation on four
+dimensions, each 1–5. Use the canonical rubric anchors:
+
+CLARITY
+  1: Jargon-only; would prompt "what is this saying?"
+  3: Failure identifiable but engineer needs source dive to be sure.
+  5: Failure named and framed; engineer can describe it back without source diving.
+
+ACCURACY
+  1: Contradicts the code path; misleads engineer toward wrong subsystem.
+  3: Correct in spirit but contains inaccurate claims.
+  5: Every claim grounded in code or spec, with citations the engineer can follow.
+
+ACTIONABILITY
+  1: No next step; engineer must invent one.
+  3: General direction ("check the revocation list") but not a specific action.
+  5: Specific next step: file path, command to run, or symbol to inspect.
+
+BREVITY
+  1: More than 2× the minimum necessary length; engineer skims and misses the point.
+  3: Roughly the right length but with paragraphs that could be cut.
+  5: As short as possible but no shorter; nothing skimmable is wasted.
+
+Reply with ONLY a JSON object:
+{
+  "clarity": <int 1-5>,
+  "accuracy": <int 1-5>,
+  "actionability": <int 1-5>,
+  "brevity": <int 1-5>,
+  "rationale": "<one sentence>"
+}
+"""
+
+
+ACCURACY_EMPHASIS_SYSTEM = CANONICAL_RUBRIC_SYSTEM.replace(
+    "ACCURACY\n"
+    "  1: Contradicts the code path; misleads engineer toward wrong subsystem.\n"
+    "  3: Correct in spirit but contains inaccurate claims.\n"
+    "  5: Every claim grounded in code or spec, with citations the engineer can follow.",
+    "ACCURACY (variant — line-number citable)\n"
+    "  1: Contradicts the code path; misleads engineer toward wrong subsystem.\n"
+    "  3: Correct in spirit but contains inaccurate or unverifiable claims.\n"
+    "  5: Every claim is verifiable via line-number citation in the cited file. "
+    "Hand-waving citations like 'see chio-kernel' without line-level grounding cap at 4.",
+)
+
+
+# === Rater protocols ===
+
+
+@dataclass
+class RaterScore:
+    rater_id: str
+    dimensions: dict[str, int] = field(default_factory=dict)
+    rationale: str = ""
+
+    def mean(self) -> float:
+        if not self.dimensions:
+            return 0.0
+        return sum(self.dimensions.values()) / len(self.dimensions)
+
+
+class Rater(Protocol):
+    rater_id: str
+
+    def score(self, scenario: dict[str, Any], augmentation_body: str) -> RaterScore: ...
+
+
+# === Concrete rater impls ===
+
+
+@dataclass
+class DeterministicRater:
+    """Fake rater. Returns a copy of the configured `base_scores`,
+    ignoring the scenario and augmentation body. For dry-runs and tests.
+    """
+
+    rater_id: str
+    base_scores: dict[str, int] = field(default_factory=lambda: {
+        "clarity": 3, "accuracy": 3, "actionability": 3, "brevity": 4,
+    })
+
+    def score(self, scenario: dict[str, Any], augmentation_body: str) -> RaterScore:
+        return RaterScore(
+            rater_id=self.rater_id,
+            dimensions=dict(self.base_scores),
+            rationale=f"deterministic stub for {self.rater_id}",
+        )
+
+
+@dataclass
+class HumanRater:
+    """Interactive rater. Prompts the user via stdin for each dimension.
+
+    Useful when a human (rater-A = @connor in our pool) is available.
+    The CLI has no skip flag; `--dry-run` avoids the prompts entirely.
+    """
+
+    rater_id: str
+    rubric_system: str = CANONICAL_RUBRIC_SYSTEM
+
+    def score(self, scenario: dict[str, Any], augmentation_body: str) -> RaterScore:
+        print(f"\n=== Rating as {self.rater_id} ===")
+        print(f"Scenario id: {scenario.get('id')}")
+        print(f"Scenario:\n{scenario.get('scenario', '')}")
+        print(f"\nAugmentation body to rate:\n{augmentation_body}\n")
+        scores: dict[str, int] = {}
+        for d in DIMENSIONS:
+            while True:
+                raw = input(f"  {d} (1-5): ").strip()
+                try:
+                    v = int(raw)
+                    if 1 <= v <= 5:
+                        scores[d] = v
+                        break
+                except ValueError:
+                    pass
+                print("    enter an integer 1-5")
+        rationale = input("  rationale (one line, optional): ").strip()
+        return RaterScore(rater_id=self.rater_id, dimensions=scores, rationale=rationale)
+
+
+@dataclass
+class AnthropicRater:
+    """Production LLM rater. Lazy-imports anthropic; falls back to
+    raising RuntimeError with a clear message if the SDK is missing or
+    ANTHROPIC_API_KEY is unset.
+    """
+
+    rater_id: str
+    model: str
+    rubric_system: str = CANONICAL_RUBRIC_SYSTEM
+
+    def score(self, scenario: dict[str, Any], augmentation_body: str) -> RaterScore:
+        try:
+            import anthropic  # type: ignore
+        except ImportError as e:
+            raise RuntimeError(
+                "anthropic SDK not installed. "
+                "`uv pip install anthropic` or use --dry-run."
+            ) from e
+        if not os.environ.get("ANTHROPIC_API_KEY"):
+            raise RuntimeError(
+                "ANTHROPIC_API_KEY not set. Export it or use --dry-run."
+            )
+        client = anthropic.Anthropic()
+        user = (
+            f"Scenario id: {scenario.get('id')}\n\n"
+            f"Scenario:\n{scenario.get('scenario', '')}\n\n"
+            f"Augmentation body to rate:\n{augmentation_body}\n"
+        )
+        msg = client.messages.create(
+            model=self.model,
+            max_tokens=512,
+            system=self.rubric_system,
+            messages=[{"role": "user", "content": user}],
+        )
+        text = msg.content[0].text  # type: ignore[attr-defined]
+        return _parse_score(self.rater_id, text)
+
+
+def _parse_score(rater_id: str, text: str) -> RaterScore:
+    """Best-effort JSON extraction from the model's response."""
+    start = text.find("{")
+    end = text.rfind("}")
+    if start == -1 or end == -1:
+        return RaterScore(rater_id=rater_id, rationale=f"unparseable: {text[:200]}")
+    try:
+        data = json.loads(text[start : end + 1])
+    except json.JSONDecodeError:
+        return RaterScore(rater_id=rater_id, rationale=f"json parse failed: {text[:200]}")
+    dims = {d: int(data.get(d, 0)) for d in DIMENSIONS if d in data}
+    return RaterScore(
+        rater_id=rater_id,
+        dimensions=dims,
+        rationale=str(data.get("rationale", "")),
+    )
+
+
+# === Calibration runner ===
+
+
+def default_pool() -> list[Rater]:
+    """Three raters per ADR-0004 (Phase 0 pool). Construct fresh each
+    time to keep state isolated across runs.
+    """
+    return [
+        HumanRater(rater_id="rater-A", rubric_system=CANONICAL_RUBRIC_SYSTEM),
+        AnthropicRater(
+            rater_id="rater-B",
+            model="claude-sonnet-4-6",
+            rubric_system=CANONICAL_RUBRIC_SYSTEM,
+        ),
+        AnthropicRater(
+            rater_id="rater-C",
+            model="claude-haiku-4-5-20251001",
+            rubric_system=ACCURACY_EMPHASIS_SYSTEM,
+        ),
+    ]
+
+
+def dry_run_pool() -> list[Rater]:
+    """Three deterministic raters for dry-run + tests."""
+    return [
+        DeterministicRater(rater_id="rater-A", base_scores={
+            "clarity": 3, "accuracy": 4, "actionability": 4, "brevity": 4,
+        }),
+        DeterministicRater(rater_id="rater-B", base_scores={
+            "clarity": 4, "accuracy": 4, "actionability": 3, "brevity": 5,
+        }),
+        DeterministicRater(rater_id="rater-C", base_scores={
+            "clarity": 3, "accuracy": 3, "actionability": 4, "brevity": 4,
+        }),
+    ]
+
+
+@dataclass
+class CalibrationRun:
+    run_number: int
+    date: str
+    scenario_id: str
+    augmentation_name: str
+    scores: list[RaterScore] = field(default_factory=list)
+
+    def disagreement_flags(self) -> dict[str, tuple[int, int]]:
+        """For each dimension, return (max-score, min-score) where
+        max - min > 1. Empty if no disagreement.
+        """
+        flags: dict[str, tuple[int, int]] = {}
+        for d in DIMENSIONS:
+            vals = [s.dimensions.get(d) for s in self.scores if d in s.dimensions]
+            vals = [v for v in vals if v is not None]
+            if len(vals) < 2:
+                continue
+            mx, mn = max(vals), min(vals)
+            if mx - mn > 1:
+                flags[d] = (mx, mn)
+        return flags
+
+
+def calibrate(
+    scenario_path: Path,
+    pool: list[Rater],
+    *,
+    augmentation_name: str = "raw",
+    run_number: int = 0,
+) -> CalibrationRun:
+    """Run one calibration round on one scenario × one augmentation."""
+    with scenario_path.open() as f:
+        scenario = yaml.safe_load(f)
+    augs = scenario.get("augmentations_under_test", [])
+    aug = next((a for a in augs if a.get("name") == augmentation_name), None)
+    if aug is None:
+        raise ValueError(
+            f"scenario {scenario_path} has no augmentation named "
+            f"{augmentation_name!r}. Choices: {[a.get('name') for a in augs]}"
+        )
+    body = aug.get("body") or json.dumps(aug.get("body_source", {}), indent=2)
+
+    run = CalibrationRun(
+        run_number=run_number,
+        date=_dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d"),
+        scenario_id=scenario.get("id", scenario_path.stem),
+        augmentation_name=augmentation_name,
+    )
+    for rater in pool:
+        try:
+            score = rater.score(scenario, body)
+        except Exception as e:
+            score = RaterScore(rater_id=rater.rater_id, rationale=f"error: {e}")
+        run.scores.append(score)
+    return run
+
+
+def render_calibration_md(run: CalibrationRun) -> str:
+    """Render a calibration run as a markdown rows-block."""
+    lines = []
+    for s in run.scores:
+        for d in DIMENSIONS:
+            val = s.dimensions.get(d, "TBD")
+            lines.append(
+                f"| {run.run_number} | {run.date} | {s.rater_id} | {d} | "
+                f"{val} | TBD | TBD | {s.rationale[:60]} |"
+            )
+    return "\n".join(lines)
+
+
+def main() -> int:
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    default_scenario = (
+        Path(__file__).resolve().parents[2]
+        / "eval" / "fixtures" / "cap-error-explanation"
+        / "revoked-cap-still-presented.yml"
+    )
+    p.add_argument("--scenario", default=str(default_scenario))
+    p.add_argument("--augmentation", default="raw")
+    p.add_argument("--run-number", type=int, default=0)
+    p.add_argument("--dry-run", action="store_true",
+                   help="Use deterministic raters; no API calls / no human input.")
+    p.add_argument("--report", default=None,
+                   help="Append rendered rows to this rater-calibration.md.")
+    args = p.parse_args()
+
+    pool = dry_run_pool() if args.dry_run else default_pool()
+    run = calibrate(
+        Path(args.scenario), pool,
+        augmentation_name=args.augmentation,
+        run_number=args.run_number,
+    )
+
+    out = {
+        "run_number": run.run_number,
+        "date": run.date,
+        "scenario_id": run.scenario_id,
+        "augmentation": run.augmentation_name,
+        "scores": [
+            {
+                "rater_id": s.rater_id,
+                "dimensions": s.dimensions,
+                "mean": s.mean(),
+                "rationale": s.rationale,
+            }
+            for s in run.scores
+        ],
+        "disagreement_flags": run.disagreement_flags(),
+    }
+    print(json.dumps(out, indent=2))
+
+    if args.report:
+        Path(args.report).parent.mkdir(parents=True, exist_ok=True)
+        with Path(args.report).open("a") as f:
+            f.write("\n<!-- calibration appended " + run.date + " -->\n")
+            f.write(render_calibration_md(run))
+            f.write("\n")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/chio-pack/tests/test_calibration.py b/chio-pack/tests/test_calibration.py
new file mode 100644
index 0000000..9ae0aa3
--- /dev/null
+++ b/chio-pack/tests/test_calibration.py
@@ -0,0 +1,102 @@
+"""Tests for the cap-error calibration harness."""
+from __future__ import annotations
+
+import pathlib
+
+import pytest
+
+from chio_pack.eval import calibration
+from chio_pack.eval.calibration import (
+    CalibrationRun,
+    DeterministicRater,
+    RaterScore,
+    calibrate,
+    render_calibration_md,
+)
+
+
+SCENARIO = (
+    pathlib.Path(__file__).resolve().parents[1]
+    / "eval" / "fixtures" / "cap-error-explanation"
+    / "revoked-cap-still-presented.yml"
+)
+
+
+def test_dry_run_pool_produces_three_raters():
+    pool = calibration.dry_run_pool()
+    assert len(pool) == 3
+    assert {r.rater_id for r in pool} == {"rater-A", "rater-B", "rater-C"}
+
+
+def test_calibrate_against_real_scenario():
+    pool = calibration.dry_run_pool()
+    run = calibrate(SCENARIO, pool, augmentation_name="raw", run_number=0)
+    assert run.scenario_id == "revoked-cap-still-presented"
+    assert run.augmentation_name == "raw"
+    assert len(run.scores) == 3
+    for s in run.scores:
+        assert set(s.dimensions.keys()) == {
+            "clarity", "accuracy", "actionability", "brevity",
+        }
+
+
+def test_calibrate_unknown_augmentation_raises():
+    pool = calibration.dry_run_pool()
+    with pytest.raises(ValueError, match="no augmentation named"):
+        calibrate(SCENARIO, pool, augmentation_name="not-a-real-aug")
+
+
+def test_disagreement_flags_detected():
+    run = CalibrationRun(
+        run_number=0, date="2026-05-08",
+        scenario_id="x", augmentation_name="raw",
+        scores=[
+            RaterScore(rater_id="A", dimensions={"clarity": 5, "accuracy": 3}),
+            RaterScore(rater_id="B", dimensions={"clarity": 3, "accuracy": 3}),
+            RaterScore(rater_id="C", dimensions={"clarity": 4, "accuracy": 3}),
+        ],
+    )
+    flags = run.disagreement_flags()
+    assert "clarity" in flags  # max 5, min 3, diff > 1
+    assert flags["clarity"] == (5, 3)
+    assert "accuracy" not in flags  # all 3, no disagreement
+
+
+def test_disagreement_flag_threshold_is_strict_greater_than_one():
+    run = CalibrationRun(
+        run_number=0, date="2026-05-08",
+        scenario_id="x", augmentation_name="raw",
+        scores=[
+            RaterScore(rater_id="A", dimensions={"clarity": 3}),
+            RaterScore(rater_id="B", dimensions={"clarity": 4}),
+        ],
+    )
+    # Diff of 1 should NOT flag
+    assert run.disagreement_flags() == {}
+
+
+def test_render_calibration_md_produces_rows():
+    pool = calibration.dry_run_pool()
+    run = calibrate(SCENARIO, pool, augmentation_name="raw")
+    md = render_calibration_md(run)
+    # 3 raters × 4 dimensions = 12 rows
+    assert md.count("\n") == 11  # 12 rows = 11 newlines
+    for d in ("clarity", "accuracy", "actionability", "brevity"):
+        assert d in md
+
+
+def test_rater_score_mean():
+    s = RaterScore(rater_id="x", dimensions={
+        "clarity": 5, "accuracy": 4, "actionability": 3, "brevity": 4,
+    })
+    assert s.mean() == 4.0
+
+
+def test_deterministic_rater_returns_configured_scores():
+    r = DeterministicRater(rater_id="test", base_scores={
+        "clarity": 5, "accuracy": 5, "actionability": 1, "brevity": 1,
+    })
+    score = r.score({"id": "x", "scenario": "..."}, "irrelevant body")
+    assert score.dimensions == {
+        "clarity": 5, "accuracy": 5, "actionability": 1, "brevity": 1,
+    }