Skip to content

Commit c57aca0

Browse files
FabioLissikAI Shtefan
authored and committed
feat: add mempalace re-mine command
Extracts source file paths from existing palace metadata, drops the collection, and re-mines with the currently configured embedding model. This is the recommended recovery path when intentionally switching models. Supports --dry-run to preview what would be re-mined. Reports missing source files (deleted since original mining).
1 parent 732a559 commit c57aca0

3 files changed

Lines changed: 243 additions & 6 deletions

File tree

mempalace/cli.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,119 @@ def cmd_status(args):
166166
status(palace_path=palace_path)
167167

168168

169+
def _extract_source_files(palace_path: str) -> set:
    """Return the distinct ``source_file`` values stored in the palace.

    Every metadata record yielded by ``iter_all_metadatas`` is inspected and
    its ``source_file`` entry is collected when it is present and non-empty.

    Args:
        palace_path: Location of the palace to read.

    Returns:
        A set of source file paths. The set is empty when the collection
        cannot be opened or when no record carries a ``source_file``.
    """
    from .palace import get_collection, iter_all_metadatas

    try:
        collection = get_collection(palace_path, force=True)
    except Exception:
        # Best-effort: an unreadable palace is reported as "no sources".
        return set()

    recorded = (meta.get("source_file") for meta in iter_all_metadatas(collection))
    return {path for path in recorded if path}
184+
185+
186+
def cmd_remine(args):
    """Re-mine palace with the currently configured embedding model.

    Reads the ``source_file`` paths recorded in the palace, copies the palace
    directory to a ``.pre-remine-backup`` sibling, deletes the
    ``mempalace_drawers`` collection and mines every source file that still
    exists on disk again.

    Args:
        args: Parsed CLI namespace. ``args.palace`` optionally overrides the
            configured palace path. When ``args.dry_run`` is true the summary
            is printed and the function returns before anything is modified.

    Raises:
        Any exception raised by the backup copy, by ChromaDB, or by
        ``mine()`` propagates. If mining fails after the collection has been
        dropped, the backup location is printed before re-raising.
    """
    from .config import get_embedding_model_name

    palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path

    if not os.path.isdir(palace_path):
        print(f"\n No palace found at {palace_path}")
        return

    # Resolve once so the header and the final summary report the same model.
    model_name = get_embedding_model_name()

    print(f"\n{'=' * 55}")
    print(" MemPalace Re-mine")
    print(f"{'=' * 55}\n")
    print(f" Palace: {palace_path}")
    print(f" Target model: {model_name}")

    # Step 1: Extract source files
    print("\n Extracting source file paths from existing drawers...")
    sources = _extract_source_files(palace_path)

    if not sources:
        print(" No drawers found. Nothing to re-mine.")
        return

    # Step 2: Partition into existing vs missing
    existing = {s for s in sources if os.path.isfile(s)}
    missing = sources - existing

    print(f" Found {len(sources)} unique source files.")
    print(f" Still exist: {len(existing)}")
    print(f" Missing: {len(missing)}")

    if missing:
        print("\n Missing files (will be skipped):")
        for f in sorted(missing)[:20]:
            print(f" - {f}")
        if len(missing) > 20:
            print(f" ... and {len(missing) - 20} more")

    if not existing:
        print("\n No source files found on disk. Nothing to re-mine.")
        return

    if args.dry_run:
        print(f"\n (dry run — would re-mine {len(existing)} files)")
        return

    # Step 3: Backup palace before destructive operation
    import shutil
    import chromadb

    backup_path = palace_path.rstrip(os.sep) + ".pre-remine-backup"
    # Copy into a staging directory first so that a failed copy never
    # destroys a backup left behind by an earlier run.
    staging_path = backup_path + ".tmp"
    if os.path.exists(staging_path):
        shutil.rmtree(staging_path)
    print(f"\n Backing up to {backup_path}...")
    shutil.copytree(palace_path, staging_path)
    if os.path.exists(backup_path):
        shutil.rmtree(backup_path)
    os.replace(staging_path, backup_path)

    # Step 4: Drop and re-create
    print(" Dropping existing collection...")
    client = chromadb.PersistentClient(path=palace_path)
    client.delete_collection("mempalace_drawers")

    # Step 5: Re-mine with exact file list (not directory scan)
    # NOTE: Files originally mined with --mode convos will be re-mined
    # as project files (fixed-size chunks instead of exchange-pair chunks).
    # The vectors will be correct for the new model, but chunk boundaries
    # will differ. A future improvement could store the original mining
    # mode in drawer metadata to preserve it across re-mines.
    # NOTE(review): mine() is called without wing_override, so the wing
    # presumably comes from each directory's own config - confirm this
    # matches the wing the drawers had before.
    print(" Re-mining...")

    from .miner import mine

    # Group files by parent dir (mine() needs a project_dir for config)
    from collections import defaultdict

    dir_files = defaultdict(list)
    for f in sorted(existing):
        # A bare filename has an empty dirname; map it to the current
        # directory, otherwise the isdir() check below would drop the file.
        dir_files[os.path.dirname(f) or os.curdir].append(f)

    try:
        for source_dir, file_list in sorted(dir_files.items()):
            if not os.path.isdir(source_dir):
                # Only possible if the directory vanished after Step 2.
                print(f"\n Skipping {source_dir}: directory no longer exists ({len(file_list)} files)")
                continue
            print(f"\n Mining: {source_dir} ({len(file_list)} files)")
            mine(
                project_dir=source_dir,
                palace_path=palace_path,
                source_files=file_list,
            )
    except BaseException:
        # The collection is already gone at this point (this also covers
        # Ctrl-C), so tell the user where the intact copy lives.
        print(f"\n Re-mine failed. The previous palace is preserved at {backup_path}")
        raise

    print(f"\n{'=' * 55}")
    print(f" Re-mine complete. Model: {model_name}")
    if missing:
        print(f" Skipped {len(missing)} missing source files.")
    print(f" Backup saved at {backup_path}")
    print(f"{'=' * 55}\n")
280+
281+
169282
def cmd_repair(args):
170283
"""Rebuild palace vector index from SQLite metadata."""
171284
import chromadb
@@ -551,6 +664,15 @@ def main():
551664

552665
sub.add_parser("status", help="Show what's been filed")
553666

667+
# re-mine
668+
p_remine = sub.add_parser(
669+
"re-mine",
670+
help="Re-mine palace with the currently configured embedding model",
671+
)
672+
p_remine.add_argument(
673+
"--dry-run", action="store_true", help="Show what would be re-mined without doing it"
674+
)
675+
554676
args = parser.parse_args()
555677

556678
if not args.command:
@@ -585,6 +707,7 @@ def main():
585707
"repair": cmd_repair,
586708
"migrate": cmd_migrate,
587709
"status": cmd_status,
710+
"re-mine": cmd_remine,
588711
}
589712
dispatch[args.command](args)
590713

mempalace/miner.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -544,20 +544,29 @@ def mine(
544544
dry_run: bool = False,
545545
respect_gitignore: bool = True,
546546
include_ignored: list = None,
547+
source_files: list = None,
547548
):
548-
"""Mine a project directory into the palace."""
549+
"""Mine a project directory into the palace.
550+
551+
If *source_files* is provided, only those files are mined (skipping
552+
the directory scan). This is used by ``mempalace re-mine`` to
553+
re-embed exactly the files that were in the palace before.
554+
"""
549555

550556
project_path = Path(project_dir).expanduser().resolve()
551557
config = load_config(project_dir)
552558

553559
wing = wing_override or config["wing"]
554560
rooms = config.get("rooms", [{"name": "general", "description": "All project files"}])
555561

556-
files = scan_project(
557-
project_dir,
558-
respect_gitignore=respect_gitignore,
559-
include_ignored=include_ignored,
560-
)
562+
if source_files is not None:
563+
files = [Path(f) for f in source_files if os.path.isfile(f)]
564+
else:
565+
files = scan_project(
566+
project_dir,
567+
respect_gitignore=respect_gitignore,
568+
include_ignored=include_ignored,
569+
)
561570
if limit > 0:
562571
files = files[:limit]
563572

tests/test_remine.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
"""Tests for the mempalace re-mine command."""
2+
3+
import os
4+
from unittest.mock import MagicMock
5+
6+
import pytest
7+
8+
from mempalace.palace import get_collection
9+
10+
11+
@pytest.fixture(autouse=True)
def reset_embedding_cache():
    """Clear the cached embedding function in mempalace.config around each test."""
    import mempalace.config as cfg_mod

    def _clear():
        # Same two module attributes are reset before and after the test.
        cfg_mod._embedding_function = None
        cfg_mod._embedding_function_resolved = False

    _clear()
    yield
    _clear()
19+
20+
21+
@pytest.fixture
def populated_palace(tmp_path, monkeypatch):
    """Build a palace with two drawers, each pointing at a real source file.

    Returns a dict with the palace path (``palace_path``), the directory
    holding the source files (``source_dir``) and the collection (``col``).
    """
    for env_var in ("MEMPALACE_EMBEDDING_MODEL", "MEMPALACE_EMBEDDING_DEVICE"):
        monkeypatch.delenv(env_var, raising=False)

    palace_path = str(tmp_path / "palace")
    source_dir = tmp_path / "source"
    source_dir.mkdir()

    # (drawer id, file name, file content)
    drawers = [
        ("drawer-1", "file1.txt", "Hello world content for file one"),
        ("drawer-2", "file2.txt", "Second file with different content"),
    ]

    for _, filename, text in drawers:
        (source_dir / filename).write_text(text)

    col = get_collection(palace_path)
    for drawer_id, filename, text in drawers:
        col.add(
            documents=[text],
            ids=[drawer_id],
            metadatas=[{
                "wing": "test",
                "room": "general",
                "source_file": str(source_dir / filename),
            }],
        )

    return {"palace_path": palace_path, "source_dir": source_dir, "col": col}
54+
55+
56+
class TestRemineExtractSources:
    """Checks for ``_extract_source_files``."""

    def test_extracts_unique_source_files(self, populated_palace):
        """Both source files recorded in the populated palace are returned."""
        from mempalace.cli import _extract_source_files

        source_dir = populated_palace["source_dir"]
        found = _extract_source_files(populated_palace["palace_path"])

        assert len(found) == 2
        for filename in ("file1.txt", "file2.txt"):
            assert str(source_dir / filename) in found

    def test_empty_palace_returns_empty(self, tmp_path, monkeypatch):
        """A palace without drawers yields an empty set."""
        for env_var in ("MEMPALACE_EMBEDDING_MODEL", "MEMPALACE_EMBEDDING_DEVICE"):
            monkeypatch.delenv(env_var, raising=False)

        palace_path = str(tmp_path / "palace")
        get_collection(palace_path)

        from mempalace.cli import _extract_source_files

        assert _extract_source_files(palace_path) == set()
72+
73+
74+
class TestRemineDryRun:
    """``cmd_remine`` with ``dry_run`` set must report and change nothing."""

    def test_dry_run_reports_counts(self, populated_palace, capsys):
        """The dry-run summary names the number of files that would be re-mined."""
        import re

        from mempalace.cli import cmd_remine

        args = MagicMock()
        args.palace = populated_palace["palace_path"]
        args.dry_run = True
        cmd_remine(args)

        output = capsys.readouterr().out
        # Match the count in context: a bare "2" can also appear in the
        # temporary directory path that is echoed in the output.
        assert re.search(r"would re-mine\s+2\s+files", output)
        assert "dry run" in output.lower()

    def test_dry_run_does_not_drop_collection(self, populated_palace):
        """A dry run leaves the drawers in place and writes no backup."""
        from mempalace.cli import cmd_remine

        palace_path = populated_palace["palace_path"]
        args = MagicMock()
        args.palace = palace_path
        args.dry_run = True
        cmd_remine(args)

        col = get_collection(palace_path)
        assert col.count() == 2
        # The backup is part of the destructive path and must not be created.
        assert not os.path.exists(palace_path + ".pre-remine-backup")
93+
94+
95+
class TestRemineMissingFiles:
    """``cmd_remine`` must report source files that no longer exist."""

    def test_reports_missing_files(self, populated_palace, capsys):
        """One deleted source file is counted and listed as missing."""
        import re

        from mempalace.cli import cmd_remine

        deleted = populated_palace["source_dir"] / "file2.txt"
        os.remove(deleted)

        args = MagicMock()
        args.palace = populated_palace["palace_path"]
        args.dry_run = True
        cmd_remine(args)

        output = capsys.readouterr().out
        # Check the counts on their own summary lines: the summary always
        # contains the word "Missing", and digits also occur in the tmp path.
        assert re.search(r"Still exist:\s*1\b", output)
        assert re.search(r"Missing:\s*1\b", output)
        # The deleted file itself must be listed.
        assert str(deleted) in output

0 commit comments

Comments
 (0)