|
| 1 | +""" |
| 2 | +ATLAST ECP — Evaluation Framework |
| 3 | +
|
| 4 | +Quality metrics beyond Trust Score: |
| 5 | +- task_adherence: did the agent follow instructions? |
| 6 | +- frustration_detection: user repeating/escalating? |
| 7 | +- response_quality: meaningful vs empty/generic? |
| 8 | +
|
| 9 | +All rule-based pattern matching. No LLM-as-judge. |
| 10 | +These are SEPARATE scores — they do NOT modify Trust Score. |
| 11 | +""" |
| 12 | + |
| 13 | +import json |
| 14 | +import re |
| 15 | +from typing import Optional |
| 16 | + |
| 17 | + |
# ── Frustration Patterns ──
# Case-insensitive phrases that signal a user repeating themselves or
# escalating.  Compiled once at import time so per-record scans are cheap.
FRUSTRATION_PATTERNS = [
    r"i already told you",
    r"i said",
    r"again[!\.]",
    r"this is wrong",
    r"that'?s not what i",
    r"you'?re not listening",
    r"please just",
    r"why can'?t you",
    r"stop doing",
    r"i don'?t want",
    r"not what i asked",
    r"try again",
    r"wrong answer",
    r"you keep",
    r"for the .* time",
]

FRUSTRATION_RE = [re.compile(p, re.IGNORECASE) for p in FRUSTRATION_PATTERNS]

# Generic / deflecting response openers.  Hoisted to module scope so
# evaluate_records() does not recompile them on every call.
_GENERIC_PATTERNS = [
    r"^i'?m sorry",
    r"^as an ai",
    r"^i cannot",
    r"^i don'?t have access",
    r"^unfortunately",
]
_GENERIC_RE = [re.compile(p, re.IGNORECASE) for p in _GENERIC_PATTERNS]


def evaluate_records(records: list, threads: Optional[list] = None) -> dict:
    """Evaluate records on multiple quality dimensions.

    All checks are rule-based pattern matching (no LLM-as-judge).  These
    scores are separate from — and never modify — the Trust Score.

    Args:
        records: record dicts read for the optional keys "input_preview",
            "output_preview", "error", and "flags".
        threads: reserved for thread-level analysis; currently unused.

    Returns: {
        task_adherence: {score, errors, empty_outputs, total},
        frustration: {score, frustration_count, repeated_inputs, examples},
        response_quality: {score, generic_responses, short_responses},
        overall: float (0-100),
        record_count: int,
    }
    """
    if not records:
        # Same key shape as the populated path so callers can read detail
        # fields without special-casing the empty case.
        return {
            "task_adherence": {"score": 100, "errors": 0, "empty_outputs": 0, "total": 0},
            "frustration": {"score": 0, "frustration_count": 0,
                            "repeated_inputs": 0, "examples": []},
            "response_quality": {"score": 100, "generic_responses": 0, "short_responses": 0},
            "overall": 100,
            "record_count": 0,
        }

    total = len(records)

    # ── Task Adherence ──
    # Proxy: a record fails if it errored, or produced no output while also
    # making no tool calls.  Each record counts at most ONCE toward the
    # failure rate (previously a record that both errored and had empty
    # output was counted twice, letting the rate exceed 100%).  The detail
    # counters still report the two conditions independently.
    error_count = 0
    empty_count = 0
    adherence_failures = 0
    for r in records:
        errored = bool(r.get("error"))
        # .count() works whether "flags" is a comma string or a list of
        # flag names — assumed one of those two shapes; TODO confirm.
        empty = (not (r.get("output_preview") or "").strip()
                 and not (r.get("flags") or "").count("has_tool_calls"))
        error_count += errored
        empty_count += empty
        if errored or empty:
            adherence_failures += 1
    adherence_score = max(0, 100 - (adherence_failures / total * 100))

    # ── Frustration Detection ──
    # Scan user inputs for frustration language; keep up to 3 examples.
    frustration_count = 0
    frustration_examples = []
    for r in records:
        inp = r.get("input_preview", "") or ""
        if any(pat.search(inp) for pat in FRUSTRATION_RE):
            frustration_count += 1
            if len(frustration_examples) < 3:
                frustration_examples.append(inp[:80])

    # Repetition is a second frustration signal: the same (truncated) input
    # appearing more than twice suggests the user is re-asking.
    input_counts = Counter(
        r.get("input_preview", "")[:50] for r in records if r.get("input_preview")
    )
    repeated = sum(1 for c in input_counts.values() if c > 2)

    # Higher = more frustrated; repeated inputs are weighted double.
    frustration_score = min(100, (frustration_count + repeated * 2) / total * 100)

    # ── Response Quality ──
    # A non-empty response is low quality if it opens with a generic
    # deflection or is very short (< 20 chars).  Each record counts at most
    # once toward quality_issues (previously a response that was both
    # generic and short was penalized twice); the detail counters still
    # report each condition independently.
    generic_count = 0
    short_count = 0
    quality_issues = 0
    non_empty = 0
    for r in records:
        out = (r.get("output_preview") or "").strip()
        if not out:
            continue
        non_empty += 1
        is_generic = any(pat.search(out) for pat in _GENERIC_RE)
        is_short = len(out) < 20
        generic_count += is_generic
        short_count += is_short
        if is_generic or is_short:
            quality_issues += 1
    quality_score = max(0, 100 - (quality_issues / max(non_empty, 1) * 100))

    # ── Overall ──
    # Weighted average: adherence 50%, frustration inverted 25%, quality 25%.
    overall = (adherence_score * 0.5 +
               max(0, 100 - frustration_score) * 0.25 +
               quality_score * 0.25)

    return {
        "task_adherence": {
            "score": round(adherence_score, 1),
            "errors": error_count,
            "empty_outputs": empty_count,
            "total": total,
        },
        "frustration": {
            "score": round(frustration_score, 1),
            "frustration_count": frustration_count,
            "repeated_inputs": repeated,
            "examples": frustration_examples,
        },
        "response_quality": {
            "score": round(quality_score, 1),
            "generic_responses": generic_count,
            "short_responses": short_count,
        },
        "overall": round(overall, 1),
        "record_count": total,
    }
0 commit comments