Commit 07d0c34

SEA CAPITAL and claude authored and committed
feat: Evaluation Framework — quality metrics beyond Trust Score (Feature 5/7)
New evaluation.py — SEPARATE from Trust Score (no risk):
- task_adherence: error rate + empty output rate → 0-100
- frustration_detection: 15 language patterns + repeated inputs → 0-100
- response_quality: generic response + short response detection → 0-100
- overall: weighted composite (adherence 50%, frustration 25%, quality 25%)

Dashboard: /api/evaluation endpoint

All rule-based pattern matching, no LLM dependency. Trust Score calculation UNCHANGED. 848 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent cdb600d commit 07d0c34

2 files changed

Lines changed: 162 additions & 0 deletions


sdk/python/atlast_ecp/dashboard_server.py

Lines changed: 20 additions & 0 deletions
@@ -115,6 +115,26 @@ def _dispatch_api(self, path: str, params: dict) -> dict:
                 "active": get_active_incident(),
             }
 
+        # ── Evaluation ──
+        elif path == "/api/evaluation":
+            agent = params.get("agent", [None])[0]
+            days = int(params.get("days", ["30"])[0])
+            from .query import _ensure_index as _ei2, _get_db as _gdb2
+            _ei2()
+            from .evaluation import evaluate_records
+            db3 = _gdb2()
+            conds = ["1=1"]
+            p2 = []
+            if agent:
+                conds.append("agent = ?")
+                p2.append(agent)
+            rows = db3.execute(
+                "SELECT id, agent, ts, model, flags, input_preview, output_preview, error, is_infra FROM records WHERE %s ORDER BY ts DESC LIMIT 500" % " AND ".join(conds), p2
+            ).fetchall()
+            db3.close()
+            recs = [{"id": r[0], "agent": r[1], "ts": r[2], "model": r[3], "flags": r[4], "input_preview": r[5], "output_preview": r[6], "error": r[7], "is_infra": r[8]} for r in rows]
+            return evaluate_records(recs)
+
         # ── Clusters ──
         elif path == "/api/clusters":
             agent = params.get("agent", [None])[0]
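For a quick smoke test once the dashboard server is running, the new endpoint can be queried like this (a minimal sketch: the host, port, and agent name are placeholders, not from this commit; note that the days parameter is parsed in the hunk above but not yet applied to the query, which simply takes the 500 most recent records):

import json
from urllib.request import urlopen

# Placeholder address and agent name; adjust to your deployment.
url = "http://localhost:8000/api/evaluation?agent=my-agent&days=30"
with urlopen(url) as resp:
    report = json.load(resp)

print(report["overall"])                  # weighted composite, 0-100
print(report["task_adherence"]["score"])  # adherence sub-score
print(report["frustration"]["examples"])  # up to 3 flagged inputs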
sdk/python/atlast_ecp/evaluation.py (new file)

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
"""
ATLAST ECP — Evaluation Framework

Quality metrics beyond Trust Score:
- task_adherence: did the agent follow instructions?
- frustration_detection: user repeating/escalating?
- response_quality: meaningful vs empty/generic?

All rule-based pattern matching. No LLM-as-judge.
These are SEPARATE scores — they do NOT modify Trust Score.
"""

import json
import re
from typing import Optional


# ── Frustration Patterns ──
FRUSTRATION_PATTERNS = [
    r"i already told you",
    r"i said",
    r"again[!\.]",
    r"this is wrong",
    r"that'?s not what i",
    r"you'?re not listening",
    r"please just",
    r"why can'?t you",
    r"stop doing",
    r"i don'?t want",
    r"not what i asked",
    r"try again",
    r"wrong answer",
    r"you keep",
    r"for the .* time",
]

FRUSTRATION_RE = [re.compile(p, re.IGNORECASE) for p in FRUSTRATION_PATTERNS]


def evaluate_records(records: list, threads: Optional[list] = None) -> dict:
    """Evaluate records on multiple quality dimensions.

    Returns: {
        task_adherence: {score, details},
        frustration: {score, details},
        response_quality: {score, details},
        overall: float (0-100),
    }
    """
    if not records:
        return {"task_adherence": {"score": 100}, "frustration": {"score": 0},
                "response_quality": {"score": 100}, "overall": 100}

    total = len(records)

    # ── Task Adherence ──
    # Proxy: records with errors or empty outputs = failed tasks
    error_count = sum(1 for r in records if r.get("error"))
    empty_count = sum(1 for r in records
                      if not (r.get("output_preview") or "").strip()
                      and not (r.get("flags") or "").count("has_tool_calls"))
    adherence_failures = error_count + empty_count
    adherence_score = max(0, 100 - (adherence_failures / total * 100)) if total else 100

    # ── Frustration Detection ──
    # Scan input_preview for frustration language
    frustration_count = 0
    frustration_examples = []
    for r in records:
        inp = r.get("input_preview", "") or ""
        for pat in FRUSTRATION_RE:
            if pat.search(inp):
                frustration_count += 1
                if len(frustration_examples) < 3:
                    frustration_examples.append(inp[:80])
                break

    # Also detect repetition: same input appearing > 2 times
    inputs = [r.get("input_preview", "")[:50] for r in records if r.get("input_preview")]
    from collections import Counter
    input_counts = Counter(inputs)
    repeated = sum(1 for _, c in input_counts.items() if c > 2)

    frustration_score = min(100, (frustration_count + repeated * 2) / max(total, 1) * 100)

    # ── Response Quality ──
    # Check for generic/low-quality responses
    generic_patterns = [
        r"^i'?m sorry",
        r"^as an ai",
        r"^i cannot",
        r"^i don'?t have access",
        r"^unfortunately",
    ]
    generic_re = [re.compile(p, re.IGNORECASE) for p in generic_patterns]

    generic_count = 0
    short_count = 0
    for r in records:
        out = (r.get("output_preview") or "").strip()
        if not out:
            continue
        # Check for generic responses
        for pat in generic_re:
            if pat.search(out):
                generic_count += 1
                break
        # Check for very short responses (< 20 chars)
        if 0 < len(out) < 20:
            short_count += 1

    quality_issues = generic_count + short_count
    non_empty = sum(1 for r in records if (r.get("output_preview") or "").strip())
    quality_score = max(0, 100 - (quality_issues / max(non_empty, 1) * 100))

    # ── Overall ──
    # Weighted average: adherence 50%, frustration inverted 25%, quality 25%
    overall = (adherence_score * 0.5 +
               max(0, 100 - frustration_score) * 0.25 +
               quality_score * 0.25)

    return {
        "task_adherence": {
            "score": round(adherence_score, 1),
            "errors": error_count,
            "empty_outputs": empty_count,
            "total": total,
        },
        "frustration": {
            "score": round(frustration_score, 1),
            "frustration_count": frustration_count,
            "repeated_inputs": repeated,
            "examples": frustration_examples,
        },
        "response_quality": {
            "score": round(quality_score, 1),
            "generic_responses": generic_count,
            "short_responses": short_count,
        },
        "overall": round(overall, 1),
        "record_count": total,
    }
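evaluate_records can also be exercised directly. A minimal sketch with synthetic records (the three dicts below are made up; their keys mirror the columns selected in the dashboard handler, and the expected scores follow from the formulas above):

from atlast_ecp.evaluation import evaluate_records

# Three synthetic records: one clean, one frustrated input with a
# generic reply, one hard error with empty output.
records = [
    {"input_preview": "summarise this report",
     "output_preview": "Here is a summary of the key points in the report.",
     "error": None, "flags": ""},
    {"input_preview": "that's not what i asked for, try again",
     "output_preview": "I'm sorry, I cannot do that.",
     "error": None, "flags": ""},
    {"input_preview": "export the data",
     "output_preview": "", "error": "TimeoutError", "flags": ""},
]

result = evaluate_records(records)
print(result["task_adherence"]["score"])    # 33.3 (1 error + 1 empty output of 3)
print(result["frustration"]["score"])       # 33.3 (1 frustrated input of 3)
print(result["response_quality"]["score"])  # 50.0 (1 generic reply of 2 non-empty)
print(result["overall"])                    # 45.8 (0.5*33.3 + 0.25*66.7 + 0.25*50.0)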
