Commit 7dd709d

add new rag data set and evaluation runner via cli component

1 parent b543089 · 11 files changed · 1510 additions & 0 deletions

Lines changed: 129 additions & 0 deletions
# RAG Evaluation Runner

The RAG Evaluation Runner provides a small, framework-agnostic pipeline
for benchmarking retrieval quality across different chunking and
embedding configurations using simple JSONL datasets.

It is designed for **Python power users** who want to:

- Point the runner at a corpus and queries file.
- Configure chunking and embedding variants.
- Run matrix experiments and collect metrics.
## Dataset format

Both corpus and queries are JSONL files. Blank lines and lines starting
with `#` are ignored.

### Corpus JSONL

Each line is a JSON object with the following fields:

- `id` (str, required)
- `text` (str, required)
- `source_uri` (str, optional)
- `metadata` (object, optional)

Example:

```json
{"id": "doc-1", "text": "Hello world", "source_uri": "memory://", "metadata": {"topic": "greeting"}}
```

### Queries JSONL

Each line is a JSON object with the following fields:

- `id` (str, required)
- `query` (str, required)
- `relevant_ids` (list[str], required) – chunk ids considered relevant
- `metadata` (object, optional)

Example:

```json
{"id": "q1", "query": "hello", "relevant_ids": ["doc-1:0"]}
```
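The parsing rules above (one JSON object per line, blank lines and `#` comments skipped) can be sketched as follows. Note that `load_jsonl` is an illustrative helper, not the component's actual `DatasetLoader` API:

```python
import json
from pathlib import Path


def load_jsonl(path: Path) -> list[dict]:
    """Parse a JSONL file, skipping blank lines and `#` comment lines."""
    records: list[dict] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue  # blank lines and comments are ignored
        records.append(json.loads(stripped))
    return records
```

The real loader additionally validates required fields such as `id` and `text` and raises `DatasetFormatError` on malformed records.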
## Experiment matrix

An evaluation run describes a matrix of experiments:

- `chunk_variants` – different `ChunkingConfig` settings.
- `embedder_variants` – logical embedders (for example `"fake"`, `"openai"`).
- `top_k_values` – list of cut-off ranks.

The runner expands these into experiments:

```text
experiments = chunk_variants × embedder_variants × top_k_values
```

Each experiment produces:

- Aggregate metrics per `k`.
- Optional per-query breakdown.
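The expansion is a plain Cartesian product, which can be sketched with `itertools.product`; the dict and string variants below are stand-ins for the runner's actual `ChunkingConfig` and `EmbedderVariant` objects:

```python
from itertools import product

# Illustrative variant lists (placeholders for the real config objects).
chunk_variants = [
    {"chunk_size": 500, "chunk_overlap": 100},
    {"chunk_size": 1000, "chunk_overlap": 200},
]
embedder_variants = ["fake", "openai"]
top_k_values = [3, 5, 10]

# Cartesian product in a stable, deterministic order:
# 2 chunk variants x 2 embedders x 3 cut-offs = 12 experiments.
experiments = list(product(chunk_variants, embedder_variants, top_k_values))
```

Because `product` iterates its inputs in order, the expansion order is stable across runs, which matters for reproducible experiment ids.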
## Metrics

The runner computes the following metrics for each `k`:

- Hit rate@k – fraction of queries with at least one relevant chunk in
  the top-`k` results.
- Precision@k – macro-averaged precision.
- Recall@k – macro-averaged recall.
- MRR@k – mean reciprocal rank.

Metrics are computed using the deterministic utilities from
`electripy.ai.rag.evaluation` plus a local MRR implementation.
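For a single query, these metrics reduce to simple set and rank arithmetic. The sketch below is illustrative (it is not the `electripy.ai.rag.evaluation` code) and assumes the common convention of dividing precision by `k` itself; the macro-averaging over queries happens in the caller:

```python
def metrics_at_k(retrieved: list[str], relevant: set[str], k: int) -> dict[str, float]:
    """Per-query hit rate, precision, recall, and reciprocal rank at cut-off k."""
    top = retrieved[:k]
    hits = [cid for cid in top if cid in relevant]
    hit_rate = 1.0 if hits else 0.0
    precision = len(hits) / k
    recall = len(hits) / len(relevant) if relevant else 0.0
    # Reciprocal rank: 1 / position of the first relevant result, else 0.
    rr = 0.0
    for rank, cid in enumerate(top, start=1):
        if cid in relevant:
            rr = 1.0 / rank
            break
    return {"hit_rate": hit_rate, "precision": precision, "recall": recall, "mrr": rr}
```

Averaging `hit_rate`, `precision`, `recall`, and `mrr` over all queries yields the macro-averaged aggregates reported per experiment.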
## CLI usage
81+
82+
A Typer-based CLI command is exposed as:
83+
84+
```bash
85+
electripy rag eval --corpus corpus.jsonl --queries queries.jsonl \
86+
--top-k 3,5,10 --chunk-size 500 --chunk-overlap 100 --embedder fake \
87+
--report-json out.json --report-csv out.csv
88+
```
89+
90+
Key options:
91+
92+
- `--corpus PATH` – corpus JSONL file.
93+
- `--queries PATH` – queries JSONL file.
94+
- `--top-k 3,5,10` – comma-separated list of cut-offs.
95+
- `--chunk-size` / `--chunk-overlap` – basic chunking config.
96+
- `--chunker-config PATH` – optional JSON file for advanced chunking
97+
configs; takes precedence over `--chunk-size` / `--chunk-overlap`.
98+
- `--embedder` – one or more embedders (for example `"fake"`),
99+
optionally as a comma-separated list.
100+
- `--report-json` / `--report-csv` – report output paths.
101+
- `--fail-under` – thresholds such as `hit_rate@5=0.85`.
102+
103+
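Parsing a comma-separated option such as `--top-k 3,5,10` can be sketched as below; whether the real CLI deduplicates or sorts the values is an assumption of this sketch:

```python
def parse_csv_ints(raw: str) -> list[int]:
    """Turn a `--top-k`-style value like "3,5,10" into a sorted list of ints.

    Duplicates are collapsed and empty segments ignored, so "10,3,3,"
    parses the same as "3,10".
    """
    return sorted({int(part) for part in raw.split(",") if part.strip()})
```

The same shape works for `--embedder fake,openai`, with `str.strip` instead of `int` per segment.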
## Determinism and reproducibility

- Fake embeddings are deterministic functions of the input text.
- The in-memory vector store uses cosine similarity with deterministic
  tie-breaking on chunk id.
- Experiments are expanded in a stable order.
- Experiment ids are computed as SHA-256 hashes of the configuration.
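One way to derive such a stable id is to hash a canonical JSON serialisation of the configuration; the exact serialisation used by the runner is an assumption here, but sorted keys are the essential ingredient, since they make the id independent of dict insertion order:

```python
import hashlib
import json


def experiment_id(config: dict) -> str:
    """Stable id: SHA-256 over the canonical JSON form of the config.

    `sort_keys=True` and fixed separators remove any dependence on key
    order or whitespace, so equal configs always hash to the same id.
    """
    canonical = json.dumps(config, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
```

Two logically identical configs built in different orders therefore share one experiment id, which keeps report files and caches stable across runs.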
## Extensibility

To plug in a custom chunker or embedder, implement the existing RAG
ports (`ChunkerPort`, `EmbeddingPort`, `VectorStorePort`) and wire them
into your own orchestration, or extend the helpers in
`electripy.ai.rag_eval_runner.services`.

In particular:

- Add a new embedder via `EmbedderVariant` and extend the
  `_build_default_embedding_port` helper.
- Swap the vector store by providing an implementation of
  `VectorStorePort` instead of `InMemoryVectorStoreAdapter`.
## CI gating with `--fail-under`

The CLI exposes `--fail-under <metric@k=value>` to make evaluations
suitable for CI. Thresholds must be met by **all** experiments; otherwise
an error is raised and the process exits with a non-zero status.
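The threshold check described above can be sketched as follows; `parse_threshold` and `check_thresholds` are illustrative helpers, and the flat `"metric@k"` keying of results is an assumption of this sketch:

```python
def parse_threshold(spec: str) -> tuple[str, int, float]:
    """Parse a `metric@k=value` spec such as "hit_rate@5=0.85"."""
    metric_at_k, value = spec.split("=", 1)
    metric, k = metric_at_k.split("@", 1)
    return metric, int(k), float(value)


def check_thresholds(
    results: dict[str, dict[str, float]],
    thresholds: list[tuple[str, int, float]],
) -> list[str]:
    """Return a failure message per experiment/threshold that is not met.

    Every experiment must satisfy every threshold; a missing metric
    counts as a failure rather than a silent pass.
    """
    failures: list[str] = []
    for exp_id, metrics in results.items():
        for metric, k, minimum in thresholds:
            actual = metrics.get(f"{metric}@{k}")
            if actual is None or actual < minimum:
                failures.append(f"{exp_id}: {metric}@{k}={actual} < {minimum}")
    return failures
```

A CI wrapper would exit non-zero when the returned list is non-empty, matching the gating behaviour described above.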
Lines changed: 25 additions & 0 deletions
```python
"""RAG evaluation runner component.

High-level exports for dataset models, experiment configuration, and
orchestration services.
"""

from __future__ import annotations

from .domain import CorpusRecord, ExperimentConfig, QueryRecord
from .errors import DatasetFormatError, EvalRunnerError, ExperimentConfigError, RagEvalError
from .services import DatasetLoader, Evaluator, IndexBuilder, ReportWriter

__all__ = [
    "CorpusRecord",
    "QueryRecord",
    "ExperimentConfig",
    "DatasetLoader",
    "IndexBuilder",
    "Evaluator",
    "ReportWriter",
    "RagEvalError",
    "DatasetFormatError",
    "ExperimentConfigError",
    "EvalRunnerError",
]
```
Lines changed: 120 additions & 0 deletions
```python
"""Adapters and fakes for the RAG evaluation runner.

This module provides:

- ``FakeEmbeddingAdapter`` – deterministic, stateless embeddings derived
  from text hashing, suitable for tests and offline runs.
- ``InMemoryVectorStoreAdapter`` – simple in-memory vector store using
  cosine similarity with deterministic tie-breaking.

Both adapters implement the ports defined in :mod:`electripy.ai.rag` and
are intentionally minimal to keep dependencies small and behaviour
predictable.
"""

from __future__ import annotations

import hashlib
import math
from collections.abc import Mapping, Sequence

from electripy.ai.rag.domain import Chunk
from electripy.ai.rag.ports import EmbeddingPort, VectorStorePort


class FakeEmbeddingAdapter(EmbeddingPort):
    """Deterministic embedding adapter based on SHA-256 hashing.

    The adapter produces fixed-size embedding vectors whose components
    are derived from the SHA-256 digest of the input text. The mapping
    is purely functional and does not involve any randomness, making it
    suitable for reproducible tests.

    Example:
        >>> adapter = FakeEmbeddingAdapter()
        >>> vectors = adapter.embed_texts(["hello", "world"])
        >>> len(vectors) == 2
        True
    """

    def __init__(self, *, dim: int = 16) -> None:
        if dim <= 0:
            raise ValueError("dim must be positive")
        self._dim = dim

    def embed_texts(self, texts: Sequence[str]) -> list[list[float]]:
        if not texts:
            return []
        return [self._embed_single(text) for text in texts]

    def _embed_single(self, text: str) -> list[float]:
        digest = hashlib.sha256(text.encode("utf-8")).digest()
        # Use bytes from the digest to populate the vector deterministically.
        values: list[float] = []
        for i in range(self._dim):
            # Wrap around the digest if needed.
            b = digest[i % len(digest)]
            # Map each byte into [-0.5, 0.5].
            values.append((float(b) / 255.0) - 0.5)
        # L2-normalise to keep cosine similarity well-behaved.
        norm = math.sqrt(sum(v * v for v in values)) or 1.0
        return [v / norm for v in values]


class InMemoryVectorStoreAdapter(VectorStorePort):
    """In-memory vector store implementing :class:`VectorStorePort`.

    Notes:
        - Stores vectors in process memory only; suitable for tests and
          local evaluation runs.
        - Uses cosine similarity for ranking and breaks ties
          deterministically by chunk id.
    """

    def __init__(self) -> None:
        self._store: dict[str, tuple[Chunk, list[float]]] = {}

    def upsert(self, chunks: Sequence[Chunk], vectors: Sequence[list[float]]) -> None:
        if len(chunks) != len(vectors):
            raise ValueError("chunks and vectors must have the same length")
        for chunk, vector in zip(chunks, vectors):
            self._store[chunk.id] = (chunk, list(vector))

    def query(
        self,
        vector: Sequence[float],
        *,
        top_k: int,
        filters: Mapping[str, object] | None = None,
    ) -> list[tuple[Chunk, float]]:
        if top_k <= 0:
            raise ValueError("top_k must be positive")
        if not self._store:
            return []

        # For now, filters are ignored; they are present to satisfy the
        # protocol and keep a future extension point.
        del filters

        norm_q = math.sqrt(sum(float(v) * float(v) for v in vector)) or 1.0
        scores: list[tuple[Chunk, float]] = []
        for _chunk_id, (chunk, stored_vec) in self._store.items():
            dot = 0.0
            norm_v = 0.0
            for a, b in zip(vector, stored_vec):
                fa = float(a)
                fb = float(b)
                dot += fa * fb
                norm_v += fb * fb
            norm_v = math.sqrt(norm_v) or 1.0
            score = dot / (norm_q * norm_v)
            scores.append((chunk, score))

        # Deterministic ordering: sort by descending score, then chunk id.
        scores.sort(key=lambda item: (-item[1], item[0].id))
        return scores[:top_k]

    def delete_by_document(self, document_id: str) -> None:
        to_delete = [
            cid
            for cid, (chunk, _) in self._store.items()
            if chunk.document_id == document_id
        ]
        for cid in to_delete:
            self._store.pop(cid, None)
```
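The deterministic tie-breaking in `query` can be shown in isolation: sorting on `(-score, id)` means equal cosine scores fall back to lexicographic id order, so repeated runs always return the same top-k. The ids and scores below are hypothetical:

```python
# (chunk_id, cosine_score) pairs with a tie between the first two.
scored = [("doc-1:1", 0.9), ("doc-1:0", 0.9), ("doc-2:0", 0.7)]

# Same key as InMemoryVectorStoreAdapter.query: descending score, then id.
ranked = sorted(scored, key=lambda item: (-item[1], item[0]))
```

Without the id component in the sort key, the order of the two tied entries would depend on insertion order, breaking run-to-run reproducibility.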
