fix(release): wire pre_release_check.py + e2b nightly + advisory tier (P0-4) (#414)

0xmariowu · web-flow · commit 5e70fa4ab56b · 2026-04-26T02:23:24.000Z
* docs(release-policy): mandatory / advisory / nightly check matrix (P0-4 S1)

* feat(pre-release): split mandatory vs advisory check tiers (P0-4 S2 source)

* test(pre-release): cover mandatory/advisory split + exit semantics (P0-4 S2 tests)

* ci(release): wire pre_release_check.py mandatory tier into release.yml (P0-4 S3)

* ci(nightly): new e2b-nightly.yml runs matrix-release-gate daily 02:00 UTC (P0-4 S4)

* style(tests): ruff format pre_release_check test (P0-4 S2 followup)

* fix(p0-4): address PR review (lambda-free + ASCII labels + SHA-pinned action + doc tweaks)

* fix(p0-4): address remaining review (docstring + FAIL marker + GH_TOKEN + open-PR error path)
diff --git a/.github/workflows/e2b-nightly.yml b/.github/workflows/e2b-nightly.yml
@@ -0,0 +1,90 @@
+name: E2B Nightly — Release Gate Matrix
+
+# Runs the e2b matrix-release-gate scenarios in cloud sandboxes once a day.
+# This is the AUTHORITATIVE pre-release E2E test for fresh-install + first-use
+# paths that local pytest cannot cover (real OS image, real pipx install,
+# real PyPI dry-run). Per docs/release-policy.md it is NOT in release.yml —
+# E2B sandbox runs cost ~$0.25 each + 5-10 min wall time, too expensive
+# to gate every PR or every release. Daily cadence is sufficient: main
+# averages a few merges per day, and a regression caught at 02:00 UTC the
+# next day is still ahead of any human user impact.
+#
+# On failure: opens a GitHub issue labeled `release-gate` and
+# `nightly-failure` so engineering triages before the next release tag.
+
+on:
+  schedule:
+    - cron: "0 2 * * *"   # UTC 02:00 daily
+  workflow_dispatch:       # manual trigger
+
+permissions:
+  contents: read
+  issues: write           # to open failure issues
+
+jobs:
+  e2b-matrix-release-gate:
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+
+      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
+        with:
+          python-version: "3.12"
+
+      - name: Install e2b runner deps
+        run: pip install -e ".[dev]"
+
+      - name: Stage secrets file
+        env:
+          E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
+          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        run: |
+          mkdir -p ~/.config
+          {
+            echo "E2B_API_KEY=${E2B_API_KEY}"
+            echo "OPENROUTER_API_KEY=${OPENROUTER_API_KEY}"
+            echo "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}"
+          } > ~/.config/ai-secrets.env
+          chmod 600 ~/.config/ai-secrets.env
+
+      - name: Run matrix-release-gate
+        id: matrix
+        run: |
+          python scripts/e2b/run_validation.py \
+            --project autosearch-release-gate \
+            --matrix tests/e2b/matrix-release-gate.yaml \
+            --output reports/e2b-nightly \
+            --source-dir . \
+            --parallel 4
+
+      - name: Upload reports
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: e2b-nightly-reports
+          path: reports/e2b-nightly/
+          retention-days: 14
+
+      - name: Open failure issue
+        if: failure()
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea  # v7.0.1
+        with:
+          script: |
+            const title = `E2B nightly release-gate FAILED — ${new Date().toISOString().slice(0, 10)}`;
+            const body = `Nightly E2B matrix-release-gate run failed.
+
+            **Run:** ${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}
+            **Workflow:** \`.github/workflows/e2b-nightly.yml\`
+            **Matrix:** \`tests/e2b/matrix-release-gate.yaml\`
+
+            Engineering triage required BEFORE the next release tag is pushed.
+            See \`docs/release-policy.md\` § Nightly checks for the policy.`;
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title,
+              body,
+              labels: ['release-gate', 'nightly-failure'],
+            });
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -78,16 +78,20 @@ jobs:
       - name: Install autosearch (for release-gate CLI surface checks)
         run: pip install -e ".[dev]"
 
-      # Note: pre_release_check.py is intentionally NOT wired into the
-      # release workflow — it bundles checks (channel experience dirs
-      # bootstrap state, Gate 12 bench results) that are useful as a
-      # local pre-tag developer tool but aren't release blockers when
-      # those artifacts haven't been generated. Run it locally via
-      # `python scripts/validate/pre_release_check.py` before tagging.
-
       - name: Run release-gate.sh --quick --pypi (version + uniqueness + lint + CLI surface)
         run: bash scripts/release-gate.sh --quick --pypi
 
+      # pre_release_check.py mandatory tier — gates release. Advisory tier
+      # (Gate 12 bench) prints warnings but does not fail this step. See
+      # docs/release-policy.md for the full mandatory / advisory / nightly
+      # matrix. GH_TOKEN is REQUIRED so `_check_open_prs()` can authenticate
+      # `gh pr list`; without it, `gh` errors out and the check fails closed
+      # (a non-zero `gh` exit is a mandatory failure), blocking the release.
+      - name: Run pre_release_check.py (mandatory checks)
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: python scripts/validate/pre_release_check.py --allow-stale-gate12
+
       - name: Install build backend
         run: pip install --upgrade "build>=1.2" "twine>=6.1" "setuptools>=80"
 
diff --git a/docs/release-policy.md b/docs/release-policy.md
@@ -0,0 +1,73 @@
+# Release Policy
+
+> Source of truth for which checks gate a release vs. which checks merely
+> inform. The release pipeline (`.github/workflows/release.yml`) wires the
+> mandatory checks; nightly workflows run the advisory ones separately.
+
+## Principle
+
+CI gates **regression**, not **quality**. A release pipeline must refuse to
+publish anything that fails a deterministic correctness check, but it must
+not block on slow / probabilistic / network-dependent quality benches —
+those drift from green-to-red for reasons unrelated to the artifact under
+release. Quality measurements live in nightly and weekly cadence.
+
+## Mandatory checks (release MUST fail if any of these fail)
+
+These run inside `release.yml` after the build job and before publish. They
+are wired through `scripts/validate/pre_release_check.py` (mandatory subset)
+plus `scripts/release-gate.sh --quick --pypi`. Failure stops the publish
+step.
+
+| Check | Where defined | Why mandatory |
+|---|---|---|
+| Version 4-file consistency | `pre_release_check.py:_check_version_consistency` | A mismatch ships a broken artifact (different version in wheel vs. plugin manifest). |
+| SKILL.md format compliance | `pre_release_check.py:_check_skill_format` | Unloadable skill files break runtime channel routing. |
+| Channel experience dirs initialized | `pre_release_check.py:_check_experience_dirs` | Missing dirs cause silent runtime failures the moment a user invokes a channel. |
+| MCP tools registered (10 v2 contract tools) | `pre_release_check.py:_check_mcp_tools` | A fresh install where the MCP layer is missing tools is unusable. |
+| Open PR release blockers (label-gated) | `pre_release_check.py:_check_open_prs` | A release while a `release-blocker` PR is open ships a known-broken artifact. |
+| Git working tree clean | `pre_release_check.py:_check_git_clean` | Releasing with uncommitted changes means the published artifact does not match any commit. |
+| Local version uniqueness | `release-gate.sh --quick` (`check_version_uniqueness.py --mode=local`) | Tag collision destroys the release. |
+| PyPI version uniqueness | `release-gate.sh --pypi` (`check_version_uniqueness.py --mode=pypi`) | PyPI rejects upload of an already-published version; release fails halfway. |
+| Lint + format (ruff) | `release-gate.sh --quick` | Already enforced on every PR; serves as belt-and-braces here. |
+| CLI surface smoke (`autosearch --help`, `mcp-check`, `doctor --json`) | `release-gate.sh --quick` | Catches packaging breakage that unit tests miss. |
+
+## Advisory checks (release continues; failures are reported but not fatal)
+
+In `pre_release_check.py` output, a failing advisory check is prefixed with
+`[WARN] [advisory]` (a passing one with `[PASS] [advisory]`), and all are
+summarized in the `ADVISORY: N/M passed` line. They surface signal but do
+not change the script's exit code; the release pipeline keeps going.
+
+| Check | Where defined | Why advisory |
+|---|---|---|
+| Gate 12 bench ≥ 50% (augment-vs-bare) | `pre_release_check.py:_check_gate12_bench` | Real-LLM bench, slow + probabilistic. A green bench costs ~$5 and 15 min; running it inside release.yml every patch release is wasteful. Drift between bench and HEAD is normal. Failures here flag *quality regression candidates* for human triage, not release blockers. |
+
+## Nightly / out-of-band checks (NOT in release.yml; run on schedule)
+
+These run in dedicated workflows. They do not block any PR or release; they
+post results (or open an issue on failure) for engineering follow-up.
+
+| Check | Workflow | Cadence | Why out of band |
+|---|---|---|---|
+| E2B matrix release gate | `.github/workflows/e2b-nightly.yml` | Daily 02:00 UTC | E2B sandbox runs cost ~$0.25 each and 5-10 min wall time. The matrix exercises real install + first-use across multiple scenarios. Catches install-path regressions that only show up in a clean OS image. Daily cadence is enough — main gets at most a few merges per day. |
+| Cross-platform install (Windows / macOS) | `.github/workflows/cross-platform.yml` | Weekly Monday 03:00 UTC | Slow runners (~15 min) and rarely catches anything new. Weekly is enough for Tier-2 platforms. |
+| Live integration tests (real APIs) | `.github/workflows/nightly.yml` | Daily 02:00 UTC | Hits external APIs (Anthropic, OpenAI, GitHub, etc.). Real spend, real rate limits — cannot be on every PR. |
+
+## How to change this policy
+
+1. Edit this file.
+2. If a check moves from mandatory → advisory, move it from `MANDATORY_CHECKS`
+   to `ADVISORY_CHECKS` in `scripts/validate/pre_release_check.py`. Advisory
+   functions are called with an `allow_stale` keyword, so the moved function
+   must accept it. Mandatory failures set the exit code to 1; advisory failures
+   only emit a `[WARN] [advisory]` line. Reverse: move it back into `MANDATORY_CHECKS`.
+3. If a check moves into / out of `release.yml`, edit the workflow.
+4. Open one PR with all three changes. Title: `policy(release): <what>`.
+   Reference this doc in the PR body.
+
+## Audit trail
+
+| Date | Change | Driver |
+|---|---|---|
+| 2026-04-26 | Initial version. Gate 12 → advisory. E2B matrix → nightly. | P0-4 from `autosearch-0425-p0-scan-report.md`. The release pipeline was bypassing `pre_release_check.py` entirely; this policy spells out exactly which subset must fire. |
diff --git a/scripts/validate/pre_release_check.py b/scripts/validate/pre_release_check.py
@@ -1,8 +1,12 @@
 #!/usr/bin/env python3
 """G7-T1: Pre-release checklist — runs all fast checks before v1.0 tag.
 
-Usage: python scripts/validate/pre_release_check.py
-Exit 0 = all checks pass. Exit 1 = one or more fail.
+Usage: python scripts/validate/pre_release_check.py [--allow-stale-gate12]
+
+Exit code semantics (per docs/release-policy.md):
+  - Exit 0 = all MANDATORY checks pass. Advisory failures are reported but
+    do not change the exit code.
+  - Exit 1 = at least one MANDATORY check failed. Release must not proceed.
 """
 
 from __future__ import annotations
@@ -13,10 +17,14 @@
 import subprocess
 import sys
 from pathlib import Path
+from typing import Callable
 
 ROOT = Path(__file__).resolve().parents[2]
 SCRIPTS = ROOT / "scripts" / "validate"
 
+CheckFn = Callable[[], tuple[bool, str]]
+AdvisoryCheckFn = Callable[..., tuple[bool, str]]
+
 
 def _run(label: str, cmd: list[str]) -> tuple[bool, str]:
     result = subprocess.run(cmd, capture_output=True, text=True, cwd=ROOT)
@@ -162,25 +170,38 @@ def _label_names(pr: dict[str, object]) -> set[str]:
 
 
 def _check_open_prs() -> tuple[bool, str]:
-    result = subprocess.run(
-        ["gh", "pr", "list", "--state", "open", "--json", "number,title,labels"],
-        capture_output=True,
-        text=True,
-        cwd=ROOT,
-        env={**os.environ, "GITHUB_TOKEN": ""},
-    )
+    # Locally we strip GITHUB_TOKEN so gh falls back to keychain auth (the
+    # ambient token is sometimes scopeless). In CI we keep GH_TOKEN — it is
+    # the only auth path.
+    env = {**os.environ, "GITHUB_TOKEN": ""} if not os.environ.get("GH_TOKEN") else os.environ
+    try:
+        result = subprocess.run(
+            ["gh", "pr", "list", "--state", "open", "--json", "number,title,labels"],
+            capture_output=True,
+            text=True,
+            cwd=ROOT,
+            env=env,
+        )
+    except FileNotFoundError:
+        # `gh` not installed — only acceptable in a dev sandbox without the
+        # CLI. CI workflows that wire this gate must have `gh` available.
+        return True, "gh CLI not installed — skipping PR check (dev env only)"
     if result.returncode != 0:
-        return True, "gh not available — skipping PR check"
+        # gh is present but failed (auth, network, rate limit). Don't fail
+        # open — silent passes here historically let release-blocker PRs
+        # slip through CI.
+        stderr_first = result.stderr.strip().splitlines()[0] if result.stderr else "no stderr"
+        return False, f"gh pr list failed (exit {result.returncode}): {stderr_first}"
     try:
         prs = json.loads(result.stdout)
-        blockers = [p for p in prs if "release-blocker" in _label_names(p)]
-        summary = f"{len(prs)} open PRs ({len(blockers)} release-blockers)"
-        if blockers:
-            titles = [f"#{p['number']}" for p in blockers[:3]]
-            return False, f"{summary}: {', '.join(titles)}"
-        return True, summary
-    except Exception:
-        return True, "could not parse gh output — skipping"
+    except json.JSONDecodeError as exc:
+        return False, f"could not parse gh output: {exc}"
+    blockers = [p for p in prs if "release-blocker" in _label_names(p)]
+    summary = f"{len(prs)} open PRs ({len(blockers)} release-blockers)"
+    if blockers:
+        titles = [f"#{p['number']}" for p in blockers[:3]]
+        return False, f"{summary}: {', '.join(titles)}"
+    return True, summary
 
 
 def _check_git_clean() -> tuple[bool, str]:
@@ -196,6 +217,42 @@ def _check_git_clean() -> tuple[bool, str]:
     return True, "working tree clean"
 
 
+MANDATORY_CHECKS: list[tuple[str, CheckFn]] = [
+    ("Version 4-file consistency", _check_version_consistency),
+    ("SKILL.md format", _check_skill_format),
+    ("Channel experience dirs", _check_experience_dirs),
+    ("MCP tools registered", _check_mcp_tools),
+    ("Open PR release blockers", _check_open_prs),
+    ("Git working tree clean", _check_git_clean),
+]
+
+ADVISORY_CHECKS: list[tuple[str, AdvisoryCheckFn]] = [
+    ("Gate 12 bench ≥ 50%", _check_gate12_bench),
+]
+
+
+def _run_mandatory_checks() -> list[tuple[str, bool, str]]:
+    results: list[tuple[str, bool, str]] = []
+    for label, fn in MANDATORY_CHECKS:
+        try:
+            ok, msg = fn()
+        except Exception as exc:
+            ok, msg = False, f"ERROR: {exc}"
+        results.append((label, ok, msg))
+    return results
+
+
+def _run_advisory_checks(*, allow_stale_gate12: bool) -> list[tuple[str, bool, str]]:
+    results: list[tuple[str, bool, str]] = []
+    for label, fn in ADVISORY_CHECKS:
+        try:
+            ok, msg = fn(allow_stale=allow_stale_gate12)
+        except Exception as exc:
+            ok, msg = False, f"ERROR: {exc}"
+        results.append((label, ok, msg))
+    return results
+
+
 def main(argv: list[str] | None = None) -> int:
     parser = argparse.ArgumentParser(description="Run fast pre-release checks.")
     parser.add_argument(
@@ -205,43 +262,36 @@ def main(argv: list[str] | None = None) -> int:
     )
     args = parser.parse_args(argv)
 
-    checks = [
-        ("Version 4-file consistency", _check_version_consistency),
-        ("SKILL.md format compliance", _check_skill_format),
-        ("Channel experience dirs", _check_experience_dirs),
-        ("MCP tools registered", _check_mcp_tools),
-        ("Gate 12 bench ≥ 50%", lambda: _check_gate12_bench(allow_stale=args.allow_stale_gate12)),
-        ("Open PR release blockers", _check_open_prs),
-        ("Git working tree clean", _check_git_clean),
-    ]
-
-    results: list[tuple[str, bool, str]] = []
-    for label, fn in checks:
-        try:
-            ok, msg = fn()
-        except Exception as exc:
-            ok, msg = False, f"ERROR: {exc}"
-        results.append((label, ok, msg))
+    mandatory_results = _run_mandatory_checks()
+    advisory_results = _run_advisory_checks(allow_stale_gate12=args.allow_stale_gate12)
 
     print()
     print("=" * 62)
     print("  AutoSearch Pre-Release Checklist")
     print("=" * 62)
-    all_pass = True
-    for label, ok, msg in results:
-        symbol = "✅" if ok else "❌"
-        print(f"  {symbol}  {label}")
-        print(f"       {msg}")
+    mandatory_pass = True
+    for label, ok, msg in mandatory_results:
+        symbol = "PASS" if ok else "FAIL"
+        print(f"  [{symbol}] [mandatory] {label}")
+        print(f"        {msg}")
         if not ok:
-            all_pass = False
+            mandatory_pass = False
+    for label, ok, msg in advisory_results:
+        symbol = "PASS" if ok else "WARN"
+        print(f"  [{symbol}] [advisory] {label}")
+        print(f"        {msg}")
+    mandatory_count = sum(1 for _, ok, _ in mandatory_results if ok)
+    advisory_count = sum(1 for _, ok, _ in advisory_results if ok)
     print("=" * 62)
-    if all_pass:
-        print("  ALL CHECKS PASSED — ready for v1.0 tag")
+    print(f"  MANDATORY: {mandatory_count}/{len(mandatory_results)} passed")
+    print(f"  ADVISORY: {advisory_count}/{len(advisory_results)} passed")
+    if mandatory_pass:
+        print("  MANDATORY CHECKS PASSED — ready for v1.0 tag")
         print("  Next: scripts/bump-version.sh → git tag v1.0.0 → git push --tags")
     else:
-        print("  SOME CHECKS FAILED — fix before tagging")
+        print("  MANDATORY CHECKS FAILED — fix before tagging")
     print()
-    return 0 if all_pass else 1
+    return 0 if mandatory_pass else 1
 
 
 if __name__ == "__main__":
diff --git a/tests/scripts/test_pre_release_check.py b/tests/scripts/test_pre_release_check.py