Skip to content

Commit 4cebe4f

Browse files
committed
add: pb export CLI and GitHub Actions workflow for auto site updates
- Add `pb export` command that generates GitHub Pages-compatible leaderboard JSON and figures CSV from evaluation results
- Add GitHub Actions workflow that auto-creates PRs on parallelbench.github.io when results/ changes are pushed
- Refactor unmasking_registry to use lazy imports (removes torch dependency for metadata-only operations)
- Extract shared data loading functions from analyze.py to parallelbench/analysis/data_loading.py
1 parent da69fa4 commit 4cebe4f

11 files changed

Lines changed: 1244 additions & 194 deletions

File tree

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Regenerates the site data files from results/ and opens a pull request
# against the parallelbench/parallelbench.github.io repository.
name: Export results to GitHub Pages

on:
  push:
    branches: [main]
    paths: ['results/**']
  workflow_dispatch:

jobs:
  export:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: astral-sh/setup-uv@v4

      - name: Install dependencies
        run: uv sync --no-dev

      - name: Export results to site data
        run: uv run pb export --output /tmp/site-data

      # Second checkout: the site repository, placed next to the source tree.
      - uses: actions/checkout@v4
        with:
          repository: parallelbench/parallelbench.github.io
          token: ${{ secrets.PAGES_PAT }}
          path: site-repo

      - name: Update site data
        run: |
          # Create the target directories first so cp does not fail when the
          # site repository does not contain them yet.
          mkdir -p site-repo/data/leaderboard site-repo/data/figures
          cp -r /tmp/site-data/leaderboard/* site-repo/data/leaderboard/
          cp -r /tmp/site-data/figures/* site-repo/data/figures/

      - name: Create PR
        uses: peter-evans/create-pull-request@v7
        with:
          path: site-repo
          token: ${{ secrets.PAGES_PAT }}
          commit-message: "update: experiment results from ParallelBench"
          title: "update: experiment results"
          body: |
            Automated export from ParallelBench results.

            Source commit: ${{ github.sha }}
          branch: auto-export/update-results
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
"""Shared data loading utilities for ParallelBench analysis.
2+
3+
Provides functions and constants for scanning result directories,
4+
parsing result JSON files, and collecting rows for analysis.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import json
10+
import logging
11+
import re
12+
from pathlib import Path
13+
14+
from parallelbench.models.unmasking_registry import get_all_config_params
15+
16+
# Module-level logger for the data loading utilities.
logger = logging.getLogger(__name__)

# Run directories are recognised by a leading YYYYMMDD_HHMMSS timestamp.
TIMESTAMP_DIR_RE = re.compile(r"^\d{8}_\d{6}")

# Names of the metrics copied into every row.
METRIC_KEYS = [
    "score", "score_strict", "nfe",
    "tokens_per_step", "input_length", "output_length",
]

# Generation kwargs that are always reported, in column order.
_BASE_GENERATION_KWARGS_KEYS = [
    "k", "steps", "block_length", "unmasking",
    "max_tokens", "temperature", "alg_temp",
]

# Base keys first, followed by every additional config param known to the
# unmasking registry, in sorted order.
GENERATION_KWARGS_KEYS = _BASE_GENERATION_KWARGS_KEYS + sorted(
    get_all_config_params() - set(_BASE_GENERATION_KWARGS_KEYS)
)
45+
46+
47+
def extract_rows_from_results(results_file: Path) -> list[dict]:
    """Extract one row per task from a results JSON file.

    Every row contains the model name, the task name, the path of the source
    file, one entry per key in ``GENERATION_KWARGS_KEYS``, one entry per
    metric in ``METRIC_KEYS`` and an ``n_samples`` entry. Values that are
    missing from the file are stored as an empty string.

    Args:
        results_file: Path to the results JSON file to read.

    Returns:
        A list with one dict per entry of the file's ``"results"`` mapping.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(results_file, encoding="utf-8") as f:
        data = json.load(f)

    # ``dict.get(key, {})`` still returns None when the key is present with a
    # null value, so every nested mapping is normalised with ``or {}``.
    config = data.get("config") or {}
    model = data.get("model_name", config.get("model", "unknown"))
    cli_generation_kwargs = config.get("gen_kwargs") or {}
    task_results = data.get("results") or {}
    task_configs = data.get("configs") or {}
    n_samples = data.get("n-samples") or {}

    rows = []
    for task_name, metrics in task_results.items():
        metrics = metrics or {}
        task_config = task_configs.get(task_name) or {}
        task_generation_kwargs = task_config.get("generation_kwargs") or {}
        # CLI kwargs are unpacked last, so they override task-level values.
        merged_generation_kwargs = {**task_generation_kwargs, **cli_generation_kwargs}

        row = {
            "model": model,
            "task": task_name,
            "results_file": str(results_file),
        }

        for key in GENERATION_KWARGS_KEYS:
            row[key] = merged_generation_kwargs.get(key, "")

        for metric in METRIC_KEYS:
            value = metrics.get(f"{metric},none", "")
            if value == "N/A":
                value = ""
            row[metric] = value

        # Fallback: compute tokens_per_step from gen_kwargs if not in metrics
        if not row.get("tokens_per_step"):
            try:
                nfe = float(row["nfe"])
                max_tokens = int(row["max_tokens"])
                row["tokens_per_step"] = max_tokens / nfe if nfe > 0 else ""
            except (ValueError, TypeError):
                # Missing or non-numeric inputs leave the value blank.
                row["tokens_per_step"] = ""

        # Compute k = max_tokens / steps (tokens unmasked per step)
        if not row.get("k"):
            try:
                max_tokens_val = int(row["max_tokens"])
                steps_val = int(row["steps"])
                row["k"] = max_tokens_val / steps_val if steps_val > 0 else ""
            except (ValueError, TypeError):
                row["k"] = ""

        task_n_samples = n_samples.get(task_name) or {}
        row["n_samples"] = task_n_samples.get("effective", "")

        rows.append(row)

    return rows
104+
105+
106+
def find_latest_result_files(results_dir: Path) -> list[Path]:
    """Pick the newest results file for each (repr_param group, task) pair.

    Selection happens per file rather than per run directory, so tasks whose
    results live in different run directories under the same repr_param
    group are each resolved on their own.

    Steps:
    1. Recursively collect every ``results_*.json`` below ``results_dir``.
    2. Bucket the files by (directory two levels above the file, filename).
    3. In each bucket, prefer files whose parent directory name matches
       ``TIMESTAMP_DIR_RE``; if none match, consider all files in the bucket.
    4. Keep the file whose parent directory name is lexicographically last.

    Args:
        results_dir: Root directory to search.

    Returns:
        One selected path per bucket, or an empty list if nothing was found.
    """
    candidates = list(results_dir.rglob("results_*.json"))
    if not candidates:
        return []

    # Bucket key: (repr_param directory, filename), i.e. one bucket per task.
    buckets: dict[tuple[Path, str], list[Path]] = {}
    for path in candidates:
        buckets.setdefault((path.parent.parent, path.name), []).append(path)

    def run_dir_name(path: Path) -> str:
        return path.parent.name

    latest: list[Path] = []
    for members in buckets.values():
        stamped = [p for p in members if TIMESTAMP_DIR_RE.match(run_dir_name(p))]
        # Fall back to every file in the bucket when no run dir is timestamped.
        latest.append(max(stamped or members, key=run_dir_name))

    return latest
143+
144+
145+
def collect_rows(results_dir: Path, sort_keys: list[str] | None = None) -> list[dict]:
    """Gather rows from every results file found under ``results_dir``.

    The ``results_*.json`` pattern covers both legacy timestamp filenames
    (results_2026-03-10T05-48-12.json) and new task-name filenames
    (results_parallelbench_waiting_line_copy.json).

    Args:
        results_dir: Root directory that is searched recursively.
        sort_keys: Optional row keys to sort by, in priority order. Values
            convertible to float sort numerically and ahead of all other
            values, which are compared as strings.

    Returns:
        All extracted rows; an empty list (with a logged warning) when no
        results file exists. Files that raise ``json.JSONDecodeError`` or
        ``KeyError`` during extraction are logged and skipped.
    """
    found = sorted(results_dir.rglob("results_*.json"))
    if not found:
        logger.warning("No results found in %s", results_dir)
        return []

    collected = []
    for path in found:
        try:
            file_rows = extract_rows_from_results(path)
        except (json.JSONDecodeError, KeyError) as e:
            logger.warning("skipping %s: %s", path, e)
        else:
            collected.extend(file_rows)

    if not sort_keys:
        return collected

    def rank(value):
        # Tag 0 puts numeric values before tag 1 (string fallback).
        try:
            return (0, float(value))
        except (ValueError, TypeError):
            return (1, str(value))

    collected.sort(key=lambda row: [rank(row.get(name, "")) for name in sort_keys])
    return collected

parallelbench/cli/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@
2727
),
2828
"browse": ("Browse benchmark tasks and samples", "pb browse waiting_line/copy"),
2929
"analyze": ("Analyze evaluation results", "pb analyze leaderboard results/"),
30+
"export": (
31+
"Export results to GitHub Pages-compatible data files",
32+
"pb export --output ./gh-pages-data",
33+
),
3034
}
3135

3236

@@ -82,6 +86,10 @@ def main():
8286
from parallelbench.cli.analyze import main as analyze_main
8387

8488
analyze_main()
89+
elif command == "export":
90+
from parallelbench.cli.export import main as export_main
91+
92+
export_main()
8593
else:
8694
console.print(
8795
f"\n[bold red]Error:[/bold red] Unknown command [yellow]'{command}'[/yellow]"

0 commit comments

Comments
 (0)