Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 99 additions & 17 deletions mempalace/backends/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import datetime as _dt
import logging
import os
import pickle
import sqlite3
from pathlib import Path
from typing import Any, Optional
Expand Down Expand Up @@ -125,6 +126,63 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
return len(head) == 2 and head[0] == 0x80 and tail == b"\x2e"


class _BoundedUnpickler(pickle.Unpickler):
"""Unpickler that refuses any class instantiation.

``index_metadata.pickle`` written by chromadb 1.5.x is a plain dict of
builtins (``id_to_label`` is a ``dict``, ``total_elements_added`` is
an ``int``, etc.). Refusing ``find_class`` for anything else means a
structurally-valid-but-malicious pickle cannot execute arbitrary code
at quarantine-check time. Older 0.6.x palaces that stored the
metadata as a custom-class instance fall back to "unknown" — the
existing byte-sniff verdict stands.
"""

_ALLOWED_BUILTINS = frozenset(
{"dict", "int", "str", "tuple", "list", "set", "frozenset", "bytes", "bool"}
)

def find_class(self, module: str, name: str) -> Any:
if module == "builtins" and name in self._ALLOWED_BUILTINS:
return super().find_class(module, name)
raise pickle.UnpicklingError(f"Refusing to load {module}.{name}")


def _segment_pickle_counts_consistent(seg_dir: str) -> Optional[bool]:
"""Return True if HNSW pickle counts agree, False if they don't, None if undetermined.

Catches the off-by-one corruption shape from chroma-core/chroma#6979,
where ``total_elements_added`` is advanced past ``id_to_label`` entries
after a write killed mid-flush. HNSW traversal eventually steps on
the unmapped element and null-derefs (SIGSEGV — same shape diagnosed
at MemPalace/mempalace#1046).

The byte-sniff in :func:`_segment_appears_healthy` cannot detect this:
the pickle is structurally valid (PROTO marker present, terminator
present, file size large). The inconsistency lives in the values.

Returns ``None`` (rather than ``False``) when the file cannot be
parsed by :class:`_BoundedUnpickler`, the keys are missing, or types
are unexpected — those cases stay with the existing byte-sniff
verdict and are not used to trigger quarantine.
"""
meta_path = os.path.join(seg_dir, "index_metadata.pickle")
if not os.path.isfile(meta_path):
return None
try:
with open(meta_path, "rb") as f:
obj = _BoundedUnpickler(f).load()
except (pickle.UnpicklingError, OSError, EOFError, ValueError, AttributeError, KeyError):
return None
if not isinstance(obj, dict):
return None
id_to_label = obj.get("id_to_label")
total_added = obj.get("total_elements_added")
if not isinstance(id_to_label, dict) or not isinstance(total_added, int):
return None
return len(id_to_label) == total_added


def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
"""Rename HNSW segment dirs that are both stale-by-mtime AND fail an
integrity sniff-test.
Expand All @@ -135,14 +193,14 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
rebuild lazily on next open instead of segfaulting.

Two-stage check:
Three-stage check:

1. **mtime gate.** If ``chroma.sqlite3`` is less than
``stale_seconds`` newer than the segment's ``data_level0.bin``,
skip — chromadb is in normal write-path territory.

2. **Integrity gate** (``_segment_appears_healthy``). Even when the
mtime gap exceeds the threshold, a segment whose
2. **Byte-sniff integrity gate** (``_segment_appears_healthy``).
Even when the mtime gap exceeds the threshold, a segment whose
``index_metadata.pickle`` passes a format sniff-test is healthy:
chromadb 1.5.x flushes HNSW state asynchronously and a clean
shutdown does NOT force-flush, so the on-disk HNSW is *always*
Expand All @@ -155,8 +213,22 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
from vector recall until the operator runs ``mempalace repair
--mode rebuild`` (15+ min on a 151K palace).

Only segments that pass stage 1 (suspiciously stale) AND fail stage
2 (metadata file truncated, zero-filled, or absent-with-data) are
3. **Pickle-counts consistency gate**
(``_segment_pickle_counts_consistent``). When the byte-sniff
passes, deserialize the metadata via :class:`_BoundedUnpickler`
and compare ``len(id_to_label) == total_elements_added``. The
off-by-one corruption shape from chroma-core/chroma#6979 — a
write killed mid-flush leaves ``total_elements_added`` advanced
on disk while the corresponding ``id_to_label`` entry is never
written — produces a structurally-valid pickle that the byte-
sniff cannot detect. HNSW traversal eventually lands on the
unmapped label and null-derefs (SIGSEGV; same shape diagnosed
at #1046). When the unpickler refuses a non-builtin or the keys
are missing, returns "undetermined" and the byte-sniff verdict
stands — older 0.6.x palaces with attr-object metadata are not
quarantined on this gate alone.

Only segments that pass stage 1 AND fail stage 2 OR stage 3 are
renamed to ``<uuid>.drift-<timestamp>``. The original directory is
renamed, not deleted, so recovery remains possible if the heuristic
misfires.
Expand Down Expand Up @@ -210,25 +282,37 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
# state condition. A healthy segment metadata file proves
# chromadb can open the segment without segfault; don't
# quarantine a healthy index.
if _segment_appears_healthy(seg_dir):
logger.info(
"HNSW mtime gap %.0fs on %s exceeds threshold but segment "
"metadata file is intact — flush-lag, not corruption. "
"Leaving in place.",
sqlite_mtime - hnsw_mtime,
seg_dir,
)
continue
byte_sniff_ok = _segment_appears_healthy(seg_dir)
consistent = _segment_pickle_counts_consistent(seg_dir) if byte_sniff_ok else None
drift_reason: Optional[str] = None
if byte_sniff_ok:
if consistent is False:
# Stage 3: pickle parses but counts disagree. Off-by-one
# corruption from chroma-core/chroma#6979 — traversal
# eventually steps on an unmapped element and SIGSEGVs.
drift_reason = "id_to_label vs total_elements_added mismatch"
else:
logger.info(
"HNSW mtime gap %.0fs on %s exceeds threshold but segment "
"metadata file is intact — flush-lag, not corruption. "
"Leaving in place.",
sqlite_mtime - hnsw_mtime,
seg_dir,
)
continue
else:
drift_reason = "metadata pickle byte-sniff failed"

stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
target = f"{seg_dir}.drift-{stamp}"
try:
os.rename(seg_dir, target)
moved.append(target)
logger.warning(
"Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
"Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, %s); renamed to %s",
seg_dir,
sqlite_mtime - hnsw_mtime,
drift_reason,
target,
)
except OSError:
Expand Down Expand Up @@ -315,8 +399,6 @@ class _SafePersistentDataUnpickler:

@classmethod
def load(cls, path: str):
import pickle

class _Restricted(pickle.Unpickler):
def find_class(self, module: str, name: str):
if (module, name) in cls._ALLOWED:
Expand Down
119 changes: 119 additions & 0 deletions tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
ChromaCollection,
_fix_blob_seq_ids,
_pin_hnsw_threads,
_segment_pickle_counts_consistent,
quarantine_stale_hnsw,
)

Expand Down Expand Up @@ -675,6 +676,124 @@ def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
assert drift.exists()


# ── pickle-counts consistency gate (stage 3) ──────────────────────────────

import pickle as _pickle # noqa: E402 -- intentionally local to fixture helpers


def _write_meta_pickle(seg_path: Path, label_count: int, total_added: int) -> None:
"""Write a chromadb-1.5.x-shaped index_metadata.pickle."""
obj = {
"id_to_label": {f"id-{i}": i for i in range(label_count)},
"total_elements_added": total_added,
"dimensionality": None,
"max_seq_id": None,
"id_to_seq_id": {},
}
(seg_path / "index_metadata.pickle").write_bytes(_pickle.dumps(obj))


def test_segment_pickle_counts_consistent_matching(tmp_path):
    """A pickle whose id_to_label size equals total_elements_added is True."""
    segment = tmp_path / "seg"
    segment.mkdir()
    _write_meta_pickle(segment, label_count=10, total_added=10)
    assert _segment_pickle_counts_consistent(str(segment)) is True


def test_segment_pickle_counts_consistent_off_by_one(tmp_path):
    """total_elements_added one ahead of id_to_label yields False — the
    exact corruption shape of chroma-core/chroma#6979 (write killed
    mid-flush)."""
    segment = tmp_path / "seg"
    segment.mkdir()
    _write_meta_pickle(segment, label_count=522_938, total_added=522_939)
    assert _segment_pickle_counts_consistent(str(segment)) is False


def test_segment_pickle_counts_consistent_returns_none_for_class_pickle(tmp_path):
    """Pickle that requires class instantiation (e.g. 0.6.x attr-object
    metadata format) returns None — the bounded unpickler refuses any
    non-builtin module, so the existing byte-sniff verdict stands."""
    import datetime as _datetime

    segment = tmp_path / "seg"
    segment.mkdir()
    # datetime.date pickles via a global reference to the datetime.date
    # class (module "datetime", not "builtins"), which _BoundedUnpickler
    # refuses — a stand-in for any non-dict custom-class metadata format.
    payload = _pickle.dumps(_datetime.date(2026, 1, 1))
    (segment / "index_metadata.pickle").write_bytes(payload)
    assert _segment_pickle_counts_consistent(str(segment)) is None


def test_segment_pickle_counts_consistent_returns_none_when_keys_missing(tmp_path):
    """A pickle lacking an expected key yields None — undetermined rather
    than False, so the byte-sniff verdict stands."""
    segment = tmp_path / "seg"
    segment.mkdir()
    payload = _pickle.dumps({"id_to_label": {}})
    (segment / "index_metadata.pickle").write_bytes(payload)
    assert _segment_pickle_counts_consistent(str(segment)) is None


def test_segment_pickle_counts_consistent_returns_none_when_file_missing(tmp_path):
    """An absent index_metadata.pickle yields None."""
    segment = tmp_path / "seg"
    segment.mkdir()
    assert _segment_pickle_counts_consistent(str(segment)) is None


def test_quarantine_stale_hnsw_quarantines_off_by_one_pickle(tmp_path):
    """Stale segment whose pickle parses but has off-by-one between
    id_to_label and total_elements_added is quarantined — byte-sniff
    alone misses this (file is structurally valid). Issue #1046 / chroma-
    core/chroma#6979."""
    base = 1_700_000_000.0
    palace, segment = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=base - 7200,
        sqlite_mtime=base,
        meta_bytes=None,  # a real pickle is written separately below
    )
    _write_meta_pickle(segment, label_count=99, total_added=100)
    quarantined = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
    assert not segment.exists()
    assert len(quarantined) == 1
    assert ".drift-" in quarantined[0]


def test_quarantine_stale_hnsw_leaves_consistent_pickle_alone(tmp_path):
    """Stale segment whose pickle parses and whose counts match stays in
    place — that is flush-lag, not corruption."""
    base = 1_700_000_000.0
    palace, segment = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=base - 7200,
        sqlite_mtime=base,
        meta_bytes=None,
    )
    _write_meta_pickle(segment, label_count=100, total_added=100)
    quarantined = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
    assert segment.exists()
    assert quarantined == []


def test_quarantine_stale_hnsw_leaves_unparseable_byte_healthy_alone(tmp_path):
    """Stale segment whose metadata passes byte-sniff but cannot be parsed
    by the bounded unpickler stays untouched — undetermined verdict
    falls back to the byte-sniff pass. Protects 0.6.x attr-object
    palaces from regressing into a quarantine loop on this gate."""
    base = 1_700_000_000.0
    palace, segment = _make_palace_with_segment(
        tmp_path,
        hnsw_mtime=base - 7200,
        sqlite_mtime=base,
        meta_bytes=_HEALTHY_META,  # passes byte-sniff but isn't a valid pickle
    )
    quarantined = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
    assert segment.exists()
    assert quarantined == []


# ── make_client cold-start gate ──────────────────────────────────────────


Expand Down