Skip to content

Commit 5e70fa4

Browse files
authored
fix(release): wire pre_release_check.py + e2b nightly + advisory tier (P0-4) (#414)
* docs(release-policy): mandatory / advisory / nightly check matrix (P0-4 S1) * feat(pre-release): split mandatory vs advisory check tiers (P0-4 S2 source) * test(pre-release): cover mandatory/advisory split + exit semantics (P0-4 S2 tests) * ci(release): wire pre_release_check.py mandatory tier into release.yml (P0-4 S3) * ci(nightly): new e2b-nightly.yml runs matrix-release-gate daily 02:00 UTC (P0-4 S4) * style(tests): ruff format pre_release_check test (P0-4 S2 followup) * fix(p0-4): address PR review (lambda-free + ASCII labels + SHA-pinned action + doc tweaks) * fix(p0-4): address remaining review (docstring + FAIL marker + GH_TOKEN + open-PR error path)
1 parent 9233b48 commit 5e70fa4

5 files changed

Lines changed: 342 additions & 59 deletions

File tree

.github/workflows/e2b-nightly.yml

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
name: E2B Nightly — Release Gate Matrix
2+
3+
# Runs the e2b matrix-release-gate scenarios in cloud sandboxes once a day.
4+
# This is the AUTHORITATIVE pre-release E2E test for fresh-install + first-use
5+
# paths that local pytest cannot cover (real OS image, real pipx install,
6+
# real PyPI dry-run). Per docs/release-policy.md it is NOT in release.yml —
7+
# E2B sandbox runs cost ~$0.25 each + 5-10 min wall time, too expensive
8+
# to gate every PR or every release. Daily cadence is sufficient: main
9+
# averages a few merges per day, and a regression caught at 02:00 UTC the
10+
# next day is still ahead of any human user impact.
11+
#
12+
# On failure: opens a GitHub issue labelled `release-gate` and `nightly-failure` so engineering
13+
# triages before the next release tag.
14+
15+
on:
16+
schedule:
17+
- cron: "0 2 * * *" # UTC 02:00 daily
18+
workflow_dispatch: # manual trigger
19+
20+
permissions:
21+
contents: read
22+
issues: write # to open failure issues
23+
24+
jobs:
25+
e2b-matrix-release-gate:
26+
runs-on: ubuntu-latest
27+
timeout-minutes: 60
28+
steps:
29+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
30+
31+
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
32+
with:
33+
python-version: "3.12"
34+
35+
- name: Install e2b runner deps
36+
run: pip install -e ".[dev]"
37+
38+
- name: Stage secrets file
39+
env:
40+
E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
41+
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
42+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
43+
run: |
44+
mkdir -p ~/.config
45+
{
46+
echo "E2B_API_KEY=${E2B_API_KEY}"
47+
echo "OPENROUTER_API_KEY=${OPENROUTER_API_KEY}"
48+
echo "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}"
49+
} > ~/.config/ai-secrets.env
50+
chmod 600 ~/.config/ai-secrets.env
51+
52+
- name: Run matrix-release-gate
53+
id: matrix
54+
run: |
55+
python scripts/e2b/run_validation.py \
56+
--project autosearch-release-gate \
57+
--matrix tests/e2b/matrix-release-gate.yaml \
58+
--output reports/e2b-nightly \
59+
--source-dir . \
60+
--parallel 4
61+
62+
- name: Upload reports
63+
if: always()
64+
uses: actions/upload-artifact@v4
65+
with:
66+
name: e2b-nightly-reports
67+
path: reports/e2b-nightly/
68+
retention-days: 14
69+
70+
- name: Open failure issue
71+
if: failure()
72+
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
73+
with:
74+
script: |
75+
const title = `E2B nightly release-gate FAILED — ${new Date().toISOString().slice(0, 10)}`;
76+
const body = `Nightly E2B matrix-release-gate run failed.
77+
78+
**Run:** ${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}
79+
**Workflow:** \`.github/workflows/e2b-nightly.yml\`
80+
**Matrix:** \`tests/e2b/matrix-release-gate.yaml\`
81+
82+
Engineering triage required BEFORE the next release tag is pushed.
83+
See \`docs/release-policy.md\` § Nightly checks for the policy.`;
84+
await github.rest.issues.create({
85+
owner: context.repo.owner,
86+
repo: context.repo.repo,
87+
title,
88+
body,
89+
labels: ['release-gate', 'nightly-failure'],
90+
});

.github/workflows/release.yml

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -78,16 +78,20 @@ jobs:
7878
- name: Install autosearch (for release-gate CLI surface checks)
7979
run: pip install -e ".[dev]"
8080

81-
# Note: pre_release_check.py is intentionally NOT wired into the
82-
# release workflow — it bundles checks (channel experience dirs
83-
# bootstrap state, Gate 12 bench results) that are useful as a
84-
# local pre-tag developer tool but aren't release blockers when
85-
# those artifacts haven't been generated. Run it locally via
86-
# `python scripts/validate/pre_release_check.py` before tagging.
87-
8881
- name: Run release-gate.sh --quick --pypi (version + uniqueness + lint + CLI surface)
8982
run: bash scripts/release-gate.sh --quick --pypi
9083

84+
# pre_release_check.py mandatory tier — gates release. Advisory tier
85+
# (Gate 12 bench) prints warnings but does not fail this step. See
86+
# docs/release-policy.md for the full mandatory / advisory / nightly
87+
# matrix. GH_TOKEN is REQUIRED so `_check_open_prs()` can authenticate
88+
# `gh pr list`; without it, `gh pr list` exits non-zero and `_check_open_prs()`
89+
# reports a mandatory failure (it no longer fails open), so the release is blocked.
90+
- name: Run pre_release_check.py (mandatory checks)
91+
env:
92+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
93+
run: python scripts/validate/pre_release_check.py --allow-stale-gate12
94+
9195
- name: Install build backend
9296
run: pip install --upgrade "build>=1.2" "twine>=6.1" "setuptools>=80"
9397

docs/release-policy.md

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Release Policy
2+
3+
> Source of truth for which checks gate a release vs. which checks merely
4+
> inform. The release pipeline (`.github/workflows/release.yml`) wires the
5+
> mandatory checks and reports the advisory ones; scheduled workflows run the nightly / out-of-band checks separately.
6+
7+
## Principle
8+
9+
CI gates **regression**, not **quality**. A release pipeline must refuse to
10+
publish anything that fails a deterministic correctness check, but it must
11+
not block on slow / probabilistic / network-dependent quality benches —
12+
those flip from green to red for reasons unrelated to the artifact under
13+
release. Quality measurements run on a nightly or weekly cadence.
14+
15+
## Mandatory checks (release MUST fail if any of these fail)
16+
17+
These run inside `release.yml` after the build job and before publish. They
18+
are wired through `scripts/validate/pre_release_check.py` (mandatory subset)
19+
plus `scripts/release-gate.sh --quick --pypi`. Failure stops the publish
20+
step.
21+
22+
| Check | Where defined | Why mandatory |
23+
|---|---|---|
24+
| Version 4-file consistency | `pre_release_check.py:_check_version_consistency` | A mismatch ships a broken artifact (different version in wheel vs. plugin manifest). |
25+
| SKILL.md format compliance | `pre_release_check.py:_check_skill_format` | Unloadable skill files break runtime channel routing. |
26+
| Channel experience dirs initialized | `pre_release_check.py:_check_experience_dirs` | Missing dirs cause silent runtime failures the moment a user invokes a channel. |
27+
| MCP tools registered (10 v2 contract tools) | `pre_release_check.py:_check_mcp_tools` | A fresh install where the MCP layer is missing tools is unusable. |
28+
| Open PR release blockers (label-gated) | `pre_release_check.py:_check_open_prs` | A release while a `release-blocker` PR is open ships a known-broken artifact. |
29+
| Git working tree clean | `pre_release_check.py:_check_git_clean` | Releasing with uncommitted changes means the published artifact does not match any commit. |
30+
| Local version uniqueness | `release-gate.sh --quick` (`check_version_uniqueness.py --mode=local`) | Tag collision destroys the release. |
31+
| PyPI version uniqueness | `release-gate.sh --pypi` (`check_version_uniqueness.py --mode=pypi`) | PyPI rejects upload of an already-published version; release fails halfway. |
32+
| Lint + format (ruff) | `release-gate.sh --quick` | Already enforced on every PR; serves as belt-and-braces here. |
33+
| CLI surface smoke (`autosearch --help`, `mcp-check`, `doctor --json`) | `release-gate.sh --quick` | Catches packaging breakage that unit tests miss. |
34+
35+
## Advisory checks (release continues; failures are reported but not fatal)
36+
37+
These appear in `pre_release_check.py` output tagged `[advisory]` (`[PASS]`
38+
on success, `[WARN]` on failure) and are summarized in the `ADVISORY: N/M passed` line. They
39+
surface signal but do not change the script's exit code; the release
40+
pipeline keeps going.
41+
42+
| Check | Where defined | Why advisory |
43+
|---|---|---|
44+
| Gate 12 bench ≥ 50% (augment-vs-bare) | `pre_release_check.py:_check_gate12_bench` | Real-LLM bench, slow + probabilistic. A green bench costs ~$5 and 15 min; running it inside release.yml every patch release is wasteful. Drift between bench and HEAD is normal. Failures here flag *quality regression candidates* for human triage, not release blockers. |
45+
46+
## Nightly / out-of-band checks (NOT in release.yml; run on schedule)
47+
48+
These run in dedicated workflows. They do not block any PR or release; they
49+
post results (or open an issue on failure) for engineering follow-up.
50+
51+
| Check | Workflow | Cadence | Why out of band |
52+
|---|---|---|---|
53+
| E2B matrix release gate | `.github/workflows/e2b-nightly.yml` | Daily 02:00 UTC | E2B sandbox runs cost ~$0.25 each and 5-10 min wall time. The matrix exercises real install + first-use across multiple scenarios. Catches install-path regressions that only show up in a clean OS image. Daily cadence is enough — main gets at most a few merges per day. |
54+
| Cross-platform install (Windows / macOS) | `.github/workflows/cross-platform.yml` | Weekly Monday 03:00 UTC | Slow runners (~15 min) and rarely catches anything new. Weekly is enough for Tier-2 platforms. |
55+
| Live integration tests (real APIs) | `.github/workflows/nightly.yml` | Daily 02:00 UTC | Hits external APIs (Anthropic, OpenAI, GitHub, etc.). Real spend, real rate limits — cannot be on every PR. |
56+
57+
## How to change this policy
58+
59+
1. Edit this file.
60+
2. If a check moves from mandatory → advisory, move it from
61+
`MANDATORY_CHECKS` to `ADVISORY_CHECKS` in
62+
`scripts/validate/pre_release_check.py`. Mandatory failures set the exit
63+
code to 1; advisory failures only emit a `[WARN] [advisory]` line.
64+
Reverse direction: move it back into `MANDATORY_CHECKS`.
65+
3. If a check moves into / out of `release.yml`, edit the workflow.
66+
4. Open one PR with all three changes. Title: `policy(release): <what>`.
67+
Reference this doc in the PR body.
68+
69+
## Audit trail
70+
71+
| Date | Change | Driver |
72+
|---|---|---|
73+
| 2026-04-26 | Initial version. Gate 12 → advisory. E2B matrix → nightly. | P0-4 from `autosearch-0425-p0-scan-report.md`. The release pipeline was bypassing `pre_release_check.py` entirely; this policy spells out exactly which subset must fire. |

scripts/validate/pre_release_check.py

Lines changed: 95 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
#!/usr/bin/env python3
22
"""G7-T1: Pre-release checklist — runs all fast checks before v1.0 tag.
33
4-
Usage: python scripts/validate/pre_release_check.py
5-
Exit 0 = all checks pass. Exit 1 = one or more fail.
4+
Usage: python scripts/validate/pre_release_check.py [--allow-stale-gate12]
5+
6+
Exit code semantics (per docs/release-policy.md):
7+
- Exit 0 = all MANDATORY checks pass. Advisory failures are reported but
8+
do not change the exit code.
9+
- Exit 1 = at least one MANDATORY check failed. Release must not proceed.
610
"""
711

812
from __future__ import annotations
@@ -13,10 +17,14 @@
1317
import subprocess
1418
import sys
1519
from pathlib import Path
20+
from typing import Callable
1621

1722
ROOT = Path(__file__).resolve().parents[2]
1823
SCRIPTS = ROOT / "scripts" / "validate"
1924

25+
CheckFn = Callable[[], tuple[bool, str]]
26+
AdvisoryCheckFn = Callable[..., tuple[bool, str]]
27+
2028

2129
def _run(label: str, cmd: list[str]) -> tuple[bool, str]:
2230
result = subprocess.run(cmd, capture_output=True, text=True, cwd=ROOT)
@@ -162,25 +170,38 @@ def _label_names(pr: dict[str, object]) -> set[str]:
162170

163171

164172
def _check_open_prs() -> tuple[bool, str]:
    """Fail when any open pull request carries the ``release-blocker`` label.

    Runs ``gh pr list --state open --json number,title,labels`` from ``ROOT``.

    Returns:
        ``(ok, message)``. ``ok`` is True when the ``gh`` executable is not
        installed (the check is skipped) or when no open PR is labelled
        ``release-blocker``. ``ok`` is False when ``gh`` exits non-zero, when
        its stdout is not valid JSON, or when at least one blocker PR is open
        (the message lists up to three PR numbers).
    """
    # Locally we strip GITHUB_TOKEN so gh falls back to keychain auth (the
    # ambient token is sometimes scopeless). In CI we keep GH_TOKEN — it is
    # the only auth path.
    env = {**os.environ, "GITHUB_TOKEN": ""} if not os.environ.get("GH_TOKEN") else os.environ
    try:
        result = subprocess.run(
            ["gh", "pr", "list", "--state", "open", "--json", "number,title,labels"],
            capture_output=True,
            text=True,
            cwd=ROOT,
            env=env,
        )
    except FileNotFoundError:
        # `gh` not installed — only acceptable in a dev sandbox without the
        # CLI. CI workflows that wire this gate must have `gh` available.
        return True, "gh CLI not installed — skipping PR check (dev env only)"
    if result.returncode != 0:
        # gh is present but failed (auth, network, rate limit). Don't fail
        # open — silent passes here historically let release-blocker PRs
        # slip through CI.
        # stderr can be empty OR whitespace-only; in the latter case
        # strip().splitlines() is an empty list, so test the list itself
        # before indexing instead of the raw string.
        stderr_lines = (result.stderr or "").strip().splitlines()
        stderr_first = stderr_lines[0] if stderr_lines else "no stderr"
        return False, f"gh pr list failed (exit {result.returncode}): {stderr_first}"
    try:
        prs = json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        return False, f"could not parse gh output: {exc}"
    blockers = [p for p in prs if "release-blocker" in _label_names(p)]
    summary = f"{len(prs)} open PRs ({len(blockers)} release-blockers)"
    if blockers:
        # Cap the listing at three PR numbers to keep the report line short.
        titles = [f"#{p['number']}" for p in blockers[:3]]
        return False, f"{summary}: {', '.join(titles)}"
    return True, summary
184205

185206

186207
def _check_git_clean() -> tuple[bool, str]:
@@ -196,6 +217,42 @@ def _check_git_clean() -> tuple[bool, str]:
196217
return True, "working tree clean"
197218

198219

220+
MANDATORY_CHECKS: list[tuple[str, CheckFn]] = [
221+
("Version 4-file consistency", _check_version_consistency),
222+
("SKILL.md format", _check_skill_format),
223+
("Channel experience dirs", _check_experience_dirs),
224+
("MCP tools registered", _check_mcp_tools),
225+
("Open PR release blockers", _check_open_prs),
226+
("Git working tree clean", _check_git_clean),
227+
]
228+
229+
ADVISORY_CHECKS: list[tuple[str, AdvisoryCheckFn]] = [
230+
("Gate 12 bench ≥ 50%", _check_gate12_bench),
231+
]
232+
233+
234+
def _run_mandatory_checks() -> list[tuple[str, bool, str]]:
    """Execute every entry of ``MANDATORY_CHECKS`` in order.

    Returns:
        One ``(label, ok, message)`` tuple per check. A check that raises is
        recorded as failed with the message ``ERROR: <exception>`` instead of
        aborting the remaining checks.
    """
    outcomes: list[tuple[str, bool, str]] = []
    for name, check in MANDATORY_CHECKS:
        try:
            passed, detail = check()
        except Exception as err:
            # A crashing check counts as a failure; keep running the rest.
            outcomes.append((name, False, f"ERROR: {err}"))
            continue
        outcomes.append((name, passed, detail))
    return outcomes
243+
244+
245+
def _run_advisory_checks(*, allow_stale_gate12: bool) -> list[tuple[str, bool, str]]:
    """Execute every entry of ``ADVISORY_CHECKS`` in order.

    Args:
        allow_stale_gate12: Forwarded to each advisory check as the
            ``allow_stale`` keyword argument.

    Returns:
        One ``(label, ok, message)`` tuple per check. A check that raises is
        recorded as failed with the message ``ERROR: <exception>`` instead of
        aborting the remaining checks.
    """
    outcomes: list[tuple[str, bool, str]] = []
    for name, check in ADVISORY_CHECKS:
        try:
            passed, detail = check(allow_stale=allow_stale_gate12)
        except Exception as err:
            # A crashing check counts as a failure; keep running the rest.
            outcomes.append((name, False, f"ERROR: {err}"))
            continue
        outcomes.append((name, passed, detail))
    return outcomes
254+
255+
199256
def main(argv: list[str] | None = None) -> int:
200257
parser = argparse.ArgumentParser(description="Run fast pre-release checks.")
201258
parser.add_argument(
@@ -205,43 +262,36 @@ def main(argv: list[str] | None = None) -> int:
205262
)
206263
args = parser.parse_args(argv)
207264

208-
checks = [
209-
("Version 4-file consistency", _check_version_consistency),
210-
("SKILL.md format compliance", _check_skill_format),
211-
("Channel experience dirs", _check_experience_dirs),
212-
("MCP tools registered", _check_mcp_tools),
213-
("Gate 12 bench ≥ 50%", lambda: _check_gate12_bench(allow_stale=args.allow_stale_gate12)),
214-
("Open PR release blockers", _check_open_prs),
215-
("Git working tree clean", _check_git_clean),
216-
]
217-
218-
results: list[tuple[str, bool, str]] = []
219-
for label, fn in checks:
220-
try:
221-
ok, msg = fn()
222-
except Exception as exc:
223-
ok, msg = False, f"ERROR: {exc}"
224-
results.append((label, ok, msg))
265+
mandatory_results = _run_mandatory_checks()
266+
advisory_results = _run_advisory_checks(allow_stale_gate12=args.allow_stale_gate12)
225267

226268
print()
227269
print("=" * 62)
228270
print(" AutoSearch Pre-Release Checklist")
229271
print("=" * 62)
230-
all_pass = True
231-
for label, ok, msg in results:
232-
symbol = "" if ok else ""
233-
print(f" {symbol} {label}")
234-
print(f" {msg}")
272+
mandatory_pass = True
273+
for label, ok, msg in mandatory_results:
274+
symbol = "PASS" if ok else "FAIL"
275+
print(f" [{symbol}] [mandatory] {label}")
276+
print(f" {msg}")
235277
if not ok:
236-
all_pass = False
278+
mandatory_pass = False
279+
for label, ok, msg in advisory_results:
280+
symbol = "PASS" if ok else "WARN"
281+
print(f" [{symbol}] [advisory] {label}")
282+
print(f" {msg}")
283+
mandatory_count = sum(1 for _, ok, _ in mandatory_results if ok)
284+
advisory_count = sum(1 for _, ok, _ in advisory_results if ok)
237285
print("=" * 62)
238-
if all_pass:
239-
print(" ALL CHECKS PASSED — ready for v1.0 tag")
286+
print(f" MANDATORY: {mandatory_count}/{len(mandatory_results)} passed")
287+
print(f" ADVISORY: {advisory_count}/{len(advisory_results)} passed")
288+
if mandatory_pass:
289+
print(" MANDATORY CHECKS PASSED — ready for v1.0 tag")
240290
print(" Next: scripts/bump-version.sh → git tag v1.0.0 → git push --tags")
241291
else:
242-
print(" SOME CHECKS FAILED — fix before tagging")
292+
print(" MANDATORY CHECKS FAILED — fix before tagging")
243293
print()
244-
return 0 if all_pass else 1
294+
return 0 if mandatory_pass else 1
245295

246296

247297
if __name__ == "__main__":

0 commit comments

Comments
 (0)