From 6e5bdda18081b7ca6478d65635955c0c4d3a45e0 Mon Sep 17 00:00:00 2001 From: bb-connor Date: Fri, 8 May 2026 22:13:50 -0400 Subject: [PATCH 1/2] Phase 1.4 (c): GraphitiHttpRouter + kb-engine optional deps + compose image fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (c) GraphitiHttpRouter — production Graphiti writer. kb_engine/sync.py: new GraphitiHttpRouter class. POSTs records with target="graphiti" to a graphiti-mcp HTTP endpoint as MCP JSON-RPC `tools/call` envelopes (default tool: `add_memory`). Records with other targets are skipped. Failures are captured in router.failures by default; --strict raises. The HTTP `post` callable is dependency-injected so tests use a fake response without httpx installed. Per AGENTS.md hard rule #1: this is the **designated** Graphiti writer. Plugin code never POSTs Graphiti directly — handlers produce DerivedRecord(target="graphiti", ...), the daemon's router list includes a GraphitiHttpRouter (alongside JsonlRouter + NullRouter as needed), and only the daemon's router runs the HTTP write. +7 tests covering: posts an episode (verifies envelope shape), skips non-graphiti records, handles HTTP errors non-strict (records to failures), strict mode raises, network exception non-strict, falls back to frontmatter for episode_body when no explicit body provided, increments JSON-RPC id per call. kb-engine total: 49 tests. (infra) Optional-dependency extras for kb-engine: [project.optional-dependencies] postgres = ["psycopg[binary]>=3.1", "pgvector>=0.2"] neo4j = ["neo4j>=5"] openai = ["openai>=1.0"] watch = ["watchdog>=4"] yaml = ["pyyaml>=6.0"] http = ["httpx>=0.27"] all = [<every driver listed above>] chio-pack now declares `kb-engine[all]` so `uv sync` brings every store driver in. The lazy-import pattern in store/embed.py / store/postgres.py / store/neo4j.py / sync.py still works without the extras for environments that don't need a particular backend. 
class GraphitiHttpRouter:
    """POST Graphiti-target records to a graphiti-mcp HTTP endpoint.

    Per AGENTS.md hard rule #1, this is the designated Graphiti writer.
    Plugin code never writes Graphiti directly — it produces
    DerivedRecord(target="graphiti", ...) objects, which the daemon
    routes through this class.

    Records with target != "graphiti" are skipped. Each Graphiti record
    becomes one MCP JSON-RPC `tools/call` envelope, by default invoking
    the `add_memory` tool.

    Args:
        url: MCP endpoint the envelopes are POSTed to.
        tool_name: MCP tool named in `params.name`.
        timeout_seconds: Timeout handed to `httpx.post` by the default
            transport (ignored by an injected `post`).
        strict: When true, the first failure is re-raised after being
            recorded; otherwise failures are only collected.
        post: Optional `post(url, json=...)` callable. Injected by tests
            so no real httpx is needed.

    Attributes:
        failures: `(record, message)` pairs for every record that could
            not be written. Accumulates across `write` calls.
    """

    DEFAULT_TOOL = "add_memory"

    def __init__(
        self,
        url: str,
        *,
        tool_name: str = DEFAULT_TOOL,
        timeout_seconds: float = 30.0,
        strict: bool = False,
        post: Any = None,
    ) -> None:
        self.url = url
        self.tool_name = tool_name
        self.timeout_seconds = timeout_seconds
        self.strict = strict
        self._post = post
        self._next_id = 1
        self.failures: list[tuple[DerivedRecord, str]] = []

    def _default_post(self, url: str, json: dict[str, Any]) -> Any:
        """POST `json` to `url` with httpx, imported lazily.

        Raises:
            RuntimeError: httpx is not installed.
        """
        try:
            import httpx  # type: ignore
        except ImportError as e:
            raise RuntimeError(
                "httpx not installed. `pip install kb-engine[http]` "
                "or pass a custom `post` callable."
            ) from e
        return httpx.post(url, json=json, timeout=self.timeout_seconds)

    def _build_envelope(self, record: "DerivedRecord") -> dict[str, Any]:
        """Build the JSON-RPC `tools/call` envelope for one record.

        Each call consumes one JSON-RPC id. When the payload carries no
        (truthy) `episode_body`, the body falls back to the JSON-encoded
        `frontmatter`, or to the whole payload if that is absent too.
        """
        rpc_id = self._next_id
        self._next_id += 1
        payload = record.payload
        episode_body = payload.get("episode_body")
        if not episode_body:
            # default=str: frontmatter values that json cannot encode
            # natively (dates, paths, ...) are sent as their str() form
            # instead of failing the record.
            episode_body = json.dumps(
                payload.get("frontmatter", payload),
                sort_keys=True,
                default=str,
            )
        arguments: dict[str, Any] = {
            "name": payload.get("name", ""),
            "episode_body": episode_body,
            "source_description": payload.get("source_description", ""),
            "source": payload.get("source", "json"),
        }
        if "group_id" in payload:
            arguments["group_id"] = payload["group_id"]
        return {
            "jsonrpc": "2.0",
            "id": rpc_id,
            "method": "tools/call",
            "params": {"name": self.tool_name, "arguments": arguments},
        }

    @staticmethod
    def _response_error(response: Any) -> "str | None":
        """Return a failure message for `response`, or None if it looks OK.

        A response is a failure when its `status_code` is >= 400, when its
        body is a JSON-RPC error object, or when the MCP tool result has
        `isError` set. Bodies that are not a JSON object (for example an
        event-stream reply) cannot be inspected and are treated as success.
        """
        status = getattr(response, "status_code", None)
        text = getattr(response, "text", "")
        text = "" if text is None else str(text)
        if status is not None and status >= 400:
            return f"HTTP {status}: {text[:200]}"
        try:
            body = json.loads(text)
        except ValueError:
            return None
        if not isinstance(body, dict):
            return None
        error = body.get("error")
        if error is not None:
            detail = error.get("message", error) if isinstance(error, dict) else error
            return f"JSON-RPC error: {str(detail)[:200]}"
        result = body.get("result")
        if isinstance(result, dict) and result.get("isError"):
            return f"MCP tool error: {text[:200]}"
        return None

    def write(self, records: Iterable["DerivedRecord"]) -> int:
        """Send every Graphiti-target record; return how many succeeded.

        Failures (envelope construction, transport exceptions, HTTP or
        JSON-RPC level errors) are appended to `self.failures`. In strict
        mode the first failure is raised after being recorded.

        Raises:
            Exception: strict mode only — whatever building or posting
                raised, or RuntimeError for an error response.
        """
        post = self._post or self._default_post
        n = 0
        for r in records:
            if r.target != "graphiti":
                continue
            try:
                # Building is inside the try so one bad payload cannot
                # abort the rest of a non-strict batch.
                envelope = self._build_envelope(r)
                response = post(self.url, json=envelope)
            except Exception as e:
                self.failures.append((r, str(e)))
                if self.strict:
                    raise
                continue
            msg = self._response_error(response)
            if msg is not None:
                self.failures.append((r, msg))
                if self.strict:
                    raise RuntimeError(f"Graphiti POST failed: {msg}")
                continue
            n += 1
        return n
# === GraphitiHttpRouter ===


class _FakeResponse:
    """Bare-bones HTTP response double: a status code plus body text."""

    def __init__(self, status_code: int = 200, text: str = '{"result": {}}'):
        self.status_code, self.text = status_code, text


def _recording_post(log: list, status_code: int = 200):
    """Return a `post` double that logs (url, json) and answers `status_code`."""

    def _post(url, json):
        log.append((url, json))
        return _FakeResponse(status_code)

    return _post


def test_graphiti_router_posts_episode():
    # One graphiti record in -> one well-formed tools/call envelope out.
    calls: list = []
    router = GraphitiHttpRouter(
        "http://localhost:8000/mcp", post=_recording_post(calls)
    )
    episode = DerivedRecord(
        target="graphiti",
        payload={
            "name": "Test episode",
            "source_description": "Test seed",
            "frontmatter": {"id": "ep.test"},
        },
    )

    written = router.write([episode])

    assert written == 1
    posted_url, rpc = calls[0]
    assert posted_url == "http://localhost:8000/mcp"
    assert rpc["jsonrpc"] == "2.0"
    assert rpc["method"] == "tools/call"
    assert rpc["params"]["name"] == "add_memory"
    assert rpc["params"]["arguments"]["name"] == "Test episode"


def test_graphiti_router_skips_non_graphiti_records():
    # Only the middle record targets graphiti; the others must not be posted.
    calls: list = []
    router = GraphitiHttpRouter(
        "http://localhost:8000/mcp", post=_recording_post(calls)
    )
    mixed = [
        DerivedRecord(target="neo4j", payload={"id": "x"}),
        DerivedRecord(target="graphiti", payload={"name": "Y"}),
        DerivedRecord(target="audit-log", payload={}),
    ]

    assert router.write(mixed) == 1
    assert len(calls) == 1


def test_graphiti_router_handles_http_errors_non_strict():
    # A 5xx is recorded in `failures` rather than raised.
    router = GraphitiHttpRouter(
        "http://localhost:8000/mcp",
        post=lambda url, json: _FakeResponse(500, text="server error"),
    )

    written = router.write([DerivedRecord(target="graphiti", payload={"name": "X"})])

    assert written == 0
    assert len(router.failures) == 1
    assert "HTTP 500" in router.failures[0][1]


def test_graphiti_router_strict_raises_on_http_error():
    # Same 5xx, but strict mode escalates it to RuntimeError.
    router = GraphitiHttpRouter(
        "http://localhost:8000/mcp",
        post=lambda url, json: _FakeResponse(500, text="boom"),
        strict=True,
    )
    record = DerivedRecord(target="graphiti", payload={"name": "X"})

    with pytest.raises(RuntimeError, match="Graphiti POST failed"):
        router.write([record])


def test_graphiti_router_handles_post_exception_non_strict():
    # Transport-level exceptions are captured, not propagated.
    def exploding_post(url, json):
        raise ConnectionError("network down")

    router = GraphitiHttpRouter("http://localhost:8000/mcp", post=exploding_post)

    written = router.write([DerivedRecord(target="graphiti", payload={"name": "X"})])

    assert written == 0
    assert "network down" in router.failures[0][1]


def test_graphiti_router_falls_back_to_frontmatter_for_episode_body():
    # No explicit episode_body -> the serialised frontmatter is sent instead.
    calls: list = []
    router = GraphitiHttpRouter(
        "http://localhost:8000/mcp", post=_recording_post(calls)
    )
    record = DerivedRecord(
        target="graphiti",
        payload={
            "name": "Y",
            "frontmatter": {"id": "y", "type": "episode-architecture-summary"},
        },
    )

    router.write([record])

    _, rpc = calls[0]
    assert "episode-architecture-summary" in rpc["params"]["arguments"]["episode_body"]


def test_graphiti_router_increments_jsonrpc_id_per_call():
    # Two records in one batch must carry distinct JSON-RPC ids.
    calls: list = []
    router = GraphitiHttpRouter(
        "http://localhost:8000/mcp", post=_recording_post(calls)
    )

    router.write(
        [
            DerivedRecord(target="graphiti", payload={"name": name})
            for name in ("A", "B")
        ]
    )

    first_id, second_id = (rpc["id"] for _, rpc in calls)
    assert len(calls) == 2
    assert first_id != second_id
"""Run-0 calibration harness for the cap-error-explanation eval (Eval 4).

Per RATERS.md "Calibration cadence": one shared scenario rated by all
three raters; scores discussed; full scoring resumes. Run-0 is the
inaugural calibration.

This module:
  - Loads a cap-error scenario YAML.
  - For each of 3 raters (A / B / C per ADR-0004), prompts the rater
    to score the four rubric dimensions (clarity, accuracy,
    actionability, brevity) on the chosen augmentation (default `raw`).
  - Captures pre-discussion scores only. The post-discussion and delta
    columns are rendered as TBD and are filled in by hand.
  - Prints the run as JSON on stdout and, when `--report PATH` is
    given, appends the rendered table rows to that file
    (e.g. `vault/_meta/dashboards/rater-calibration.md`).

Three rater types, all satisfying a single `Rater` Protocol:
  - HumanRater          interactive (stdin) — for rater-A
  - AnthropicRater      Anthropic SDK call with rubric system prompt
  - DeterministicRater  fake; for tests + dry-runs without API access

The Anthropic rater is the production form for rater-B (Sonnet 4.6,
canonical rubric) and rater-C (Haiku 4.5, accuracy-emphasis rubric).

Run-0 is the bar for ADR-0002 sign-off — without it, the inter-rater
calibration row in RATERS.md stays TBD and Phase 1 cannot start.
"""
from __future__ import annotations

import argparse
import datetime as _dt
import json
import os
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable, Protocol

try:
    import yaml
except ImportError:
    # Deferred: only `calibrate` needs PyYAML, so the pure helpers stay
    # importable without it. `calibrate` raises a clear error instead.
    yaml = None


DIMENSIONS = ("clarity", "accuracy", "actionability", "brevity")


# === Rubric source ===

# The rubric is assembled from three pieces so the accuracy-emphasis
# variant can swap its ACCURACY section in directly; a textual
# `.replace` would silently do nothing if the two copies ever drifted.

_RUBRIC_HEAD = """\
You are a Chio capability-error rater. Score the explanation on four
dimensions, each 1–5. Use the canonical rubric anchors:

CLARITY
  1: Jargon-only; would prompt "what is this saying?"
  3: Failure identifiable but engineer needs source dive to be sure.
  5: Failure named and framed; engineer can describe it back without source diving.

"""

_ACCURACY_CANONICAL = (
    "ACCURACY\n"
    "  1: Contradicts the code path; misleads engineer toward wrong subsystem.\n"
    "  3: Correct in spirit but contains inaccurate claims.\n"
    "  5: Every claim grounded in code or spec, with citations the engineer can follow."
)

_ACCURACY_LINE_CITABLE = (
    "ACCURACY (variant — line-number citable)\n"
    "  1: Contradicts the code path; misleads engineer toward wrong subsystem.\n"
    "  3: Correct in spirit but contains inaccurate or unverifiable claims.\n"
    "  5: Every claim is verifiable via line-number citation in the cited file. "
    "Hand-waving citations like 'see chio-kernel' without line-level grounding cap at 4."
)

_RUBRIC_TAIL = """\


ACTIONABILITY
  1: No next step; engineer must invent one.
  3: General direction ("check the revocation list") but not a specific action.
  5: Specific next step: file path, command to run, or symbol to inspect.

BREVITY
  1: More than 2× the minimum necessary length; engineer skims and misses the point.
  3: Roughly the right length but with paragraphs that could be cut.
  5: As short as possible but no shorter; nothing skimmable is wasted.

Reply with ONLY a JSON object:
{
  "clarity": <integer 1-5>,
  "accuracy": <integer 1-5>,
  "actionability": <integer 1-5>,
  "brevity": <integer 1-5>,
  "rationale": "<one sentence>"
}
"""

# Used by rater-A and rater-B.
CANONICAL_RUBRIC_SYSTEM = _RUBRIC_HEAD + _ACCURACY_CANONICAL + _RUBRIC_TAIL

# Used by rater-C: identical except for the stricter ACCURACY anchors.
ACCURACY_EMPHASIS_SYSTEM = _RUBRIC_HEAD + _ACCURACY_LINE_CITABLE + _RUBRIC_TAIL


# === Rater protocols ===


@dataclass
class RaterScore:
    """One rater's scores for one augmentation.

    `dimensions` maps a dimension name to its 1-5 score and may be
    partial or empty when the rater failed or its reply was unusable;
    `rationale` then carries the reason.
    """

    rater_id: str
    dimensions: dict[str, int] = field(default_factory=dict)
    rationale: str = ""

    def mean(self) -> float:
        """Return the mean of the recorded scores, or 0.0 if none."""
        if not self.dimensions:
            return 0.0
        return sum(self.dimensions.values()) / len(self.dimensions)


class Rater(Protocol):
    """Structural interface every rater implements."""

    rater_id: str

    def score(self, scenario: dict[str, Any], augmentation_body: str) -> RaterScore: ...


# === Concrete rater impls ===


@dataclass
class DeterministicRater:
    """Fake rater for dry-runs and tests.

    Always returns a copy of `base_scores`; the scenario and the
    augmentation body are ignored.
    """

    rater_id: str
    base_scores: dict[str, int] = field(default_factory=lambda: {
        "clarity": 3, "accuracy": 3, "actionability": 3, "brevity": 4,
    })

    def score(self, scenario: dict[str, Any], augmentation_body: str) -> RaterScore:
        return RaterScore(
            rater_id=self.rater_id,
            dimensions=dict(self.base_scores),
            rationale=f"deterministic stub for {self.rater_id}",
        )


@dataclass
class HumanRater:
    """Interactive rater. Prompts the user via stdin for each dimension.

    Useful when a human (rater-A = @connor in our pool) is available.
    All prompt text goes to stderr so that stdout carries only the JSON
    document printed by `main`. To run without a human, use `--dry-run`.
    """

    rater_id: str
    rubric_system: str = CANONICAL_RUBRIC_SYSTEM

    @staticmethod
    def _ask(prompt: str) -> str:
        """Show `prompt` on stderr and return one stripped line from stdin."""
        print(prompt, end="", file=sys.stderr, flush=True)
        return input().strip()

    def score(self, scenario: dict[str, Any], augmentation_body: str) -> RaterScore:
        """Print the scenario, then read four 1-5 scores and a rationale."""
        print(f"\n=== Rating as {self.rater_id} ===", file=sys.stderr)
        print(f"Scenario id: {scenario.get('id')}", file=sys.stderr)
        print(f"Scenario:\n{scenario.get('scenario', '')}", file=sys.stderr)
        print(f"\nAugmentation body to rate:\n{augmentation_body}\n", file=sys.stderr)
        scores: dict[str, int] = {}
        for d in DIMENSIONS:
            # Re-prompt until the answer is an integer in range.
            while True:
                raw = self._ask(f"  {d} (1-5): ")
                try:
                    v = int(raw)
                    if 1 <= v <= 5:
                        scores[d] = v
                        break
                except ValueError:
                    pass
                print("    enter an integer 1-5", file=sys.stderr)
        rationale = self._ask("  rationale (one line, optional): ")
        return RaterScore(rater_id=self.rater_id, dimensions=scores, rationale=rationale)


@dataclass
class AnthropicRater:
    """Production LLM rater.

    Lazy-imports anthropic. `score` raises RuntimeError with a clear
    message if the SDK is missing or ANTHROPIC_API_KEY is unset.
    """

    rater_id: str
    model: str
    rubric_system: str = CANONICAL_RUBRIC_SYSTEM

    def score(self, scenario: dict[str, Any], augmentation_body: str) -> RaterScore:
        """Ask the model to rate `augmentation_body` and parse its reply."""
        try:
            import anthropic  # type: ignore
        except ImportError as e:
            raise RuntimeError(
                "anthropic SDK not installed. "
                "`uv pip install anthropic` or use --dry-run."
            ) from e
        if not os.environ.get("ANTHROPIC_API_KEY"):
            raise RuntimeError(
                "ANTHROPIC_API_KEY not set. Export it or use --dry-run."
            )
        client = anthropic.Anthropic()
        user = (
            f"Scenario id: {scenario.get('id')}\n\n"
            f"Scenario:\n{scenario.get('scenario', '')}\n\n"
            f"Augmentation body to rate:\n{augmentation_body}\n"
        )
        msg = client.messages.create(
            model=self.model,
            max_tokens=512,
            system=self.rubric_system,
            messages=[{"role": "user", "content": user}],
        )
        # Concatenate every block that carries text instead of assuming
        # the first block exists and is a text block.
        text = "".join(getattr(block, "text", "") or "" for block in msg.content)
        return _parse_score(self.rater_id, text)


def _coerce_score(value: Any) -> int | None:
    """Return `value` as an int in 1..5, or None if it is not one."""
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        v = value
    elif isinstance(value, float) and value.is_integer():
        v = int(value)
    elif isinstance(value, str):
        try:
            v = int(value.strip())
        except ValueError:
            return None
    else:
        return None
    return v if 1 <= v <= 5 else None


def _parse_score(rater_id: str, text: str) -> RaterScore:
    """Best-effort JSON extraction from the model's response.

    Takes the span from the first `{` to the last `}`. Dimensions that
    are missing are skipped; dimensions whose value is not an integer
    in 1..5 are dropped and named in the rationale. Never raises.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1:
        return RaterScore(rater_id=rater_id, rationale=f"unparseable: {text[:200]}")
    try:
        data = json.loads(text[start : end + 1])
    except json.JSONDecodeError:
        return RaterScore(rater_id=rater_id, rationale=f"json parse failed: {text[:200]}")
    if not isinstance(data, dict):
        return RaterScore(rater_id=rater_id, rationale=f"unparseable: {text[:200]}")
    dims: dict[str, int] = {}
    rejected: list[str] = []
    for d in DIMENSIONS:
        if d not in data:
            continue
        v = _coerce_score(data[d])
        if v is None:
            rejected.append(d)
        else:
            dims[d] = v
    rationale = str(data.get("rationale", ""))
    if rejected:
        rationale = f"{rationale} [invalid score dropped: {', '.join(rejected)}]".strip()
    return RaterScore(rater_id=rater_id, dimensions=dims, rationale=rationale)


# === Calibration runner ===


def default_pool() -> list[Rater]:
    """Three raters per ADR-0004 (Phase 0 pool). Construct fresh each
    time to keep state isolated across runs.
    """
    return [
        HumanRater(rater_id="rater-A", rubric_system=CANONICAL_RUBRIC_SYSTEM),
        AnthropicRater(
            rater_id="rater-B",
            model="claude-sonnet-4-6",
            rubric_system=CANONICAL_RUBRIC_SYSTEM,
        ),
        AnthropicRater(
            rater_id="rater-C",
            model="claude-haiku-4-5-20251001",
            rubric_system=ACCURACY_EMPHASIS_SYSTEM,
        ),
    ]


def dry_run_pool() -> list[Rater]:
    """Three deterministic raters for dry-run + tests."""
    return [
        DeterministicRater(rater_id="rater-A", base_scores={
            "clarity": 3, "accuracy": 4, "actionability": 4, "brevity": 4,
        }),
        DeterministicRater(rater_id="rater-B", base_scores={
            "clarity": 4, "accuracy": 4, "actionability": 3, "brevity": 5,
        }),
        DeterministicRater(rater_id="rater-C", base_scores={
            "clarity": 3, "accuracy": 3, "actionability": 4, "brevity": 4,
        }),
    ]


@dataclass
class CalibrationRun:
    """Scores from every rater for one scenario × one augmentation."""

    run_number: int
    date: str
    scenario_id: str
    augmentation_name: str
    scores: list[RaterScore] = field(default_factory=list)

    def disagreement_flags(self) -> dict[str, tuple[int, int]]:
        """Return `{dimension: (max_score, min_score)}` where max - min > 1.

        The threshold is strict: a spread of exactly 1 is not flagged.
        Dimensions scored by fewer than two raters are ignored. Empty
        if there is no disagreement.
        """
        flags: dict[str, tuple[int, int]] = {}
        for d in DIMENSIONS:
            vals = [
                s.dimensions[d]
                for s in self.scores
                if s.dimensions.get(d) is not None
            ]
            if len(vals) < 2:
                continue
            mx, mn = max(vals), min(vals)
            if mx - mn > 1:
                flags[d] = (mx, mn)
        return flags


def calibrate(
    scenario_path: Path,
    pool: list[Rater],
    *,
    augmentation_name: str = "raw",
    run_number: int = 0,
) -> CalibrationRun:
    """Run one calibration round on one scenario × one augmentation.

    Args:
        scenario_path: Scenario YAML file to load.
        pool: Raters to consult, in order.
        augmentation_name: `name` of the entry to pick from the
            scenario's `augmentations_under_test` list.
        run_number: Recorded on the returned run.

    Returns:
        A CalibrationRun with one RaterScore per rater. A rater that
        raises contributes an empty score whose rationale holds the
        error, so one failing rater does not lose the others' scores.

    Raises:
        RuntimeError: PyYAML is not installed.
        ValueError: The scenario has no augmentation with that name.
    """
    if yaml is None:
        raise RuntimeError("PyYAML required to load scenarios (`pip install pyyaml`).")
    with scenario_path.open(encoding="utf-8") as f:
        # An empty file loads as None; treat it as a scenario with no keys.
        scenario = yaml.safe_load(f) or {}
    augs = scenario.get("augmentations_under_test") or []
    aug = next((a for a in augs if a.get("name") == augmentation_name), None)
    if aug is None:
        raise ValueError(
            f"scenario {scenario_path} has no augmentation named "
            f"{augmentation_name!r}. Choices: {[a.get('name') for a in augs]}"
        )
    # Prefer the literal body; otherwise rate the serialised body_source.
    body = aug.get("body") or json.dumps(aug.get("body_source", {}), indent=2)

    run = CalibrationRun(
        run_number=run_number,
        date=_dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d"),
        scenario_id=scenario.get("id", scenario_path.stem),
        augmentation_name=augmentation_name,
    )
    for rater in pool:
        try:
            score = rater.score(scenario, body)
        except Exception as e:  # deliberate best-effort boundary, see Returns
            score = RaterScore(rater_id=rater.rater_id, rationale=f"error: {e}")
        run.scores.append(score)
    return run


def _md_cell(text: str, limit: int = 60) -> str:
    """Truncate `text` and make it safe inside a one-line table cell."""
    flat = " ".join(text[:limit].splitlines())
    return flat.replace("|", "\\|")


def render_calibration_md(run: CalibrationRun) -> str:
    """Render a calibration run as a markdown rows-block.

    Emits one row per rater × dimension. A missing score and the two
    columns after the score are rendered as TBD. The rationale is cut
    to 60 characters, with newlines flattened and `|` escaped so it
    cannot break the table.
    """
    lines = []
    for s in run.scores:
        rationale = _md_cell(s.rationale)
        for d in DIMENSIONS:
            val = s.dimensions.get(d, "TBD")
            lines.append(
                f"| {run.run_number} | {run.date} | {s.rater_id} | {d} | "
                f"{val} | TBD | TBD | {rationale} |"
            )
    return "\n".join(lines)


def main() -> int:
    """CLI entry point: run one calibration round and print it as JSON.

    With `--report PATH` the rendered markdown rows are also appended
    to PATH (parent directories are created). Always returns 0; rater
    failures are reported inside the JSON, not via the exit code.
    """
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    default_scenario = (
        Path(__file__).resolve().parents[2]
        / "eval" / "fixtures" / "cap-error-explanation"
        / "revoked-cap-still-presented.yml"
    )
    p.add_argument("--scenario", default=str(default_scenario))
    p.add_argument("--augmentation", default="raw")
    p.add_argument("--run-number", type=int, default=0)
    p.add_argument("--dry-run", action="store_true",
                   help="Use deterministic raters; no API calls / no human input.")
    p.add_argument("--report", default=None,
                   help="Append rendered rows to this rater-calibration.md.")
    args = p.parse_args()

    pool = dry_run_pool() if args.dry_run else default_pool()
    run = calibrate(
        Path(args.scenario), pool,
        augmentation_name=args.augmentation,
        run_number=args.run_number,
    )

    out = {
        "run_number": run.run_number,
        "date": run.date,
        "scenario_id": run.scenario_id,
        "augmentation": run.augmentation_name,
        "scores": [
            {
                "rater_id": s.rater_id,
                "dimensions": s.dimensions,
                "mean": s.mean(),
                "rationale": s.rationale,
            }
            for s in run.scores
        ],
        "disagreement_flags": run.disagreement_flags(),
    }
    print(json.dumps(out, indent=2))

    if args.report:
        report_path = Path(args.report)
        report_path.parent.mkdir(parents=True, exist_ok=True)
        with report_path.open("a", encoding="utf-8") as f:
            f.write("\n\n")
            f.write(render_calibration_md(run))
            f.write("\n")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
"""Tests for the cap-error calibration harness."""
from __future__ import annotations

import pathlib

import pytest

from chio_pack.eval import calibration
from chio_pack.eval.calibration import (
    CalibrationRun,
    DeterministicRater,
    RaterScore,
    calibrate,
    render_calibration_md,
)


# Shared fixture scenario shipped with the pack.
SCENARIO = (
    pathlib.Path(__file__).resolve().parents[1]
    / "eval" / "fixtures" / "cap-error-explanation"
    / "revoked-cap-still-presented.yml"
)

_ALL_DIMENSIONS = {"clarity", "accuracy", "actionability", "brevity"}


def _run_with(*scores: RaterScore) -> CalibrationRun:
    """Build a CalibrationRun around hand-written scores."""
    return CalibrationRun(
        run_number=0,
        date="2026-05-08",
        scenario_id="x",
        augmentation_name="raw",
        scores=list(scores),
    )


def test_dry_run_pool_produces_three_raters():
    raters = calibration.dry_run_pool()
    assert len(raters) == 3
    assert {rater.rater_id for rater in raters} == {"rater-A", "rater-B", "rater-C"}


def test_calibrate_against_real_scenario():
    # Dry-run raters against the real fixture: every rater scores every dimension.
    result = calibrate(
        SCENARIO, calibration.dry_run_pool(), augmentation_name="raw", run_number=0
    )
    assert result.scenario_id == "revoked-cap-still-presented"
    assert result.augmentation_name == "raw"
    assert len(result.scores) == 3
    for rater_score in result.scores:
        assert set(rater_score.dimensions.keys()) == _ALL_DIMENSIONS


def test_calibrate_unknown_augmentation_raises():
    raters = calibration.dry_run_pool()
    with pytest.raises(ValueError, match="no augmentation named"):
        calibrate(SCENARIO, raters, augmentation_name="not-a-real-aug")


def test_disagreement_flags_detected():
    run = _run_with(
        RaterScore(rater_id="A", dimensions={"clarity": 5, "accuracy": 3}),
        RaterScore(rater_id="B", dimensions={"clarity": 3, "accuracy": 3}),
        RaterScore(rater_id="C", dimensions={"clarity": 4, "accuracy": 3}),
    )
    flagged = run.disagreement_flags()
    # clarity: max 5, min 3 -> spread above 1
    assert "clarity" in flagged
    assert flagged["clarity"] == (5, 3)
    # accuracy: everyone said 3 -> no disagreement
    assert "accuracy" not in flagged


def test_disagreement_flag_threshold_is_strict_greater_than_one():
    run = _run_with(
        RaterScore(rater_id="A", dimensions={"clarity": 3}),
        RaterScore(rater_id="B", dimensions={"clarity": 4}),
    )
    # A spread of exactly 1 must NOT be flagged.
    assert run.disagreement_flags() == {}


def test_render_calibration_md_produces_rows():
    result = calibrate(SCENARIO, calibration.dry_run_pool(), augmentation_name="raw")
    table = render_calibration_md(result)
    # 3 raters × 4 dimensions = 12 rows, i.e. 11 separating newlines.
    assert table.count("\n") == 11
    for dimension in ("clarity", "accuracy", "actionability", "brevity"):
        assert dimension in table


def test_rater_score_mean():
    rater_score = RaterScore(
        rater_id="x",
        dimensions={"clarity": 5, "accuracy": 4, "actionability": 3, "brevity": 4},
    )
    assert rater_score.mean() == 4.0


def test_deterministic_rater_returns_configured_scores():
    configured = {"clarity": 5, "accuracy": 5, "actionability": 1, "brevity": 1}
    rater = DeterministicRater(rater_id="test", base_scores=dict(configured))
    outcome = rater.score({"id": "x", "scenario": "..."}, "irrelevant body")
    assert outcome.dimensions == {
        "clarity": 5, "accuracy": 5, "actionability": 1, "brevity": 1,
    }