Skip to content

Commit 6742244

Browse files
kAI Shtefan, FabioLissi, claude
authored and committed
feat: init-time embedding model binding + multilingual support
Embedding model is bound to the palace at init time via collection metadata and changeable only through re-mine. Removes all embedding-related environment variables in favor of explicit CLI flags and auto-detection. Features: - `mempalace init --model <name>` binds embedding model at creation - `mempalace re-mine --model <new>` migrates to a different model - `mempalace init --chunk-size N` configures chunk size per palace - Auto-detect device: cuda > mps (arm64 only) > cpu - Embedding model mismatch detection with clear error messages - Legacy palace auto-migration (stamps "chromadb-default") - Backend seam abstraction for storage layer - MCP ping health checks Removed env vars: - MEMPALACE_EMBEDDING_MODEL (use --model flag) - MEMPALACE_EMBEDDING_DEVICE (auto-detected) - MEMPALACE_CHUNK_SIZE / MEMPALACE_CHUNK_OVERLAP (use --chunk-size) - MEMPALACE_FORCE_EMBEDDING (use --force flag) Benchmarked on 247 Russian blog posts (intfloat/multilingual-e5-base): 758 drawers, 49ms avg search, 0.719 avg similarity, 100% high-relevance Co-authored-by: Fabio Lissi <fabio.lissi@gmail.com> Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 122a5fd commit 6742244

14 files changed

Lines changed: 951 additions & 142 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,5 @@ venv/
3636

3737
# ChromaDB local data
3838
*.sqlite3-journal
39+
.venv-sandbox/
40+
MagicMock/

mempalace/backends/chroma.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,18 @@ def delete(self, **kwargs):
6767
def count(self):
    """Return whatever the wrapped collection's ``count()`` reports."""
    wrapped = self._collection
    return wrapped.count()
6969

70+
@property
def metadata(self):
    """Expose the ``metadata`` attribute of the wrapped collection."""
    wrapped = self._collection
    return wrapped.metadata
73+
74+
def modify(self, **kwargs):
    """Forward all keyword arguments to the wrapped collection's ``modify``."""
    wrapped = self._collection
    wrapped.modify(**kwargs)
76+
7077

7178
class ChromaBackend:
7279
"""Factory for MemPalace's default ChromaDB backend."""
7380

74-
def get_collection(self, palace_path: str, collection_name: str, create: bool = False):
81+
def get_collection(self, palace_path: str, collection_name: str, create: bool = False, embedding_function=None, metadata=None):
7582
if not create and not os.path.isdir(palace_path):
7683
raise FileNotFoundError(palace_path)
7784

@@ -84,8 +91,11 @@ def get_collection(self, palace_path: str, collection_name: str, create: bool =
8491

8592
_fix_blob_seq_ids(palace_path)
8693
client = chromadb.PersistentClient(path=palace_path)
94+
kwargs = {"embedding_function": embedding_function}
95+
if metadata is not None:
96+
kwargs["metadata"] = metadata
8797
if create:
88-
collection = client.get_or_create_collection(collection_name)
98+
collection = client.get_or_create_collection(collection_name, **kwargs)
8999
else:
90-
collection = client.get_collection(collection_name)
100+
collection = client.get_collection(collection_name, embedding_function=embedding_function)
91101
return ChromaCollection(collection)

mempalace/cli.py

Lines changed: 198 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@
3333
import argparse
3434
from pathlib import Path
3535

36-
from .config import MempalaceConfig
36+
from .config import MempalaceConfig, read_collection_metadata
37+
from .palace import get_collection as _palace_get_collection
3738

3839

3940
def cmd_init(args):
@@ -51,7 +52,6 @@ def cmd_init(args):
5152
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
5253
if total > 0:
5354
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
54-
# Save confirmed entities to <project>/entities.json for the miner
5555
if confirmed["people"] or confirmed["projects"]:
5656
entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
5757
with open(entities_path, "w") as f:
@@ -62,7 +62,27 @@ def cmd_init(args):
6262

6363
# Pass 2: detect rooms from folder structure
6464
detect_rooms_local(project_dir=args.dir, yes=getattr(args, "yes", False))
65-
MempalaceConfig().init()
65+
66+
# Global config
67+
cfg = MempalaceConfig()
68+
cfg.init()
69+
70+
# Create collection with embedding model bound in metadata
71+
palace_path = os.path.expanduser(args.palace) if getattr(args, "palace", None) else cfg.palace_path
72+
model = getattr(args, "model", None) or "chromadb-default"
73+
chunk_size = getattr(args, "chunk_size", None)
74+
chunk_overlap = getattr(args, "chunk_overlap", None)
75+
76+
_palace_get_collection(
77+
palace_path,
78+
model=model,
79+
chunk_size=chunk_size,
80+
chunk_overlap=chunk_overlap,
81+
)
82+
print(f"\n Palace initialized: {palace_path}")
83+
print(f" Embedding model: {model}")
84+
if chunk_size:
85+
print(f" Chunk size: {chunk_size}")
6686

6787

6888
def cmd_mine(args):
@@ -133,8 +153,6 @@ def cmd_split(args):
133153
from .split_mega_files import main as split_main
134154
import sys
135155

136-
# Rebuild argv for split_mega_files argparse
137-
# Expand ~ and resolve to absolute path so split_mega_files sees a real path
138156
argv = ["--source", str(Path(args.dir).expanduser().resolve())]
139157
if args.output_dir:
140158
argv += ["--output-dir", args.output_dir]
@@ -163,9 +181,144 @@ def cmd_status(args):
163181
from .miner import status
164182

165183
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
184+
185+
col_meta = read_collection_metadata(palace_path)
186+
if col_meta:
187+
print(f"\n Palace config (from collection metadata):")
188+
print(f" Embedding model: {col_meta.get('embedding_model', 'unknown')}")
189+
if "chunk_size" in col_meta:
190+
print(f" Chunk size: {col_meta['chunk_size']}")
191+
if "chunk_overlap" in col_meta:
192+
print(f" Chunk overlap: {col_meta['chunk_overlap']}")
193+
166194
status(palace_path=palace_path)
167195

168196

197+
def _extract_source_files(palace_path: str) -> set:
    """Collect the distinct ``source_file`` values stored on a palace's drawers.

    Returns an empty set when the collection cannot be opened. Entries whose
    ``source_file`` is absent or falsy are ignored.
    """
    from .palace import get_collection, iter_all_metadatas

    try:
        collection = get_collection(palace_path, force=True)
    except Exception:
        # Best-effort: a collection that cannot be opened yields no sources.
        return set()

    recorded = (entry.get("source_file") for entry in iter_all_metadatas(collection))
    return {path for path in recorded if path}
212+
213+
214+
def cmd_remine(args):
    """Re-mine palace with a new or current embedding model.

    Workflow:
      1. Read the source-file paths recorded on the existing drawers.
      2. Split them into files that still exist on disk and files that do not.
      3. Copy the palace directory to ``<palace>.pre-remine-backup``
         (replacing any earlier backup at that path).
      4. Delete the ``mempalace_drawers`` collection and re-create it bound
         to the target model.
      5. Mine the surviving files, grouped by their parent directory.

    Drawers whose source file is missing are not re-created; after the run
    they exist only in the backup copy.

    Args:
        args: Parsed CLI namespace. ``palace`` and ``dry_run`` are read
            directly; ``model``, ``chunk_size`` and ``chunk_overlap`` are
            optional and fall back to the values stored in the existing
            collection metadata when omitted.
    """
    palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path

    if not os.path.isdir(palace_path):
        print(f"\n No palace found at {palace_path}")
        return

    # Metadata may come back empty/falsy; normalise to a dict so the
    # lookups below cannot fail on a missing value.
    col_meta = read_collection_metadata(palace_path) or {}
    previous_model = col_meta.get("embedding_model")

    # Determine target model: explicit flag wins, else keep the bound model.
    new_model = getattr(args, "model", None)
    target_model = new_model if new_model else col_meta.get("embedding_model", "chromadb-default")

    # Carry the palace's stored chunk settings forward unless overridden.
    # The collection (and its metadata) is dropped below, so values not
    # passed to the re-create call would otherwise be lost.
    chunk_size = getattr(args, "chunk_size", None)
    if chunk_size is None:
        chunk_size = col_meta.get("chunk_size")
    chunk_overlap = getattr(args, "chunk_overlap", None)
    if chunk_overlap is None:
        chunk_overlap = col_meta.get("chunk_overlap")

    print(f"\n{'=' * 55}")
    print(" MemPalace Re-mine")
    print(f"{'=' * 55}\n")
    print(f" Palace: {palace_path}")
    print(f" Target model: {target_model}")
    if new_model and previous_model and previous_model != new_model:
        print(f" Previous model: {previous_model}")

    # Step 1: Extract source files
    print("\n Extracting source file paths from existing drawers...")
    sources = _extract_source_files(palace_path)

    if not sources:
        print(" No drawers found. Nothing to re-mine.")
        return

    # Step 2: Partition into existing vs missing
    existing = {s for s in sources if os.path.isfile(s)}
    missing = sources - existing

    print(f" Found {len(sources)} unique source files.")
    print(f" Still exist: {len(existing)}")
    print(f" Missing: {len(missing)}")

    if missing:
        print("\n Missing files (will be skipped):")
        for f in sorted(missing)[:20]:
            print(f" - {f}")
        if len(missing) > 20:
            print(f" ... and {len(missing) - 20} more")

    if not existing:
        print("\n No source files found on disk. Nothing to re-mine.")
        return

    if args.dry_run:
        print(f"\n (dry run — would re-mine {len(existing)} files)")
        return

    # Step 3: Backup palace before destructive operation
    import shutil
    import chromadb

    backup_path = palace_path.rstrip(os.sep) + ".pre-remine-backup"
    if os.path.exists(backup_path):
        shutil.rmtree(backup_path)
    print(f"\n Backing up to {backup_path}...")
    shutil.copytree(palace_path, backup_path)

    # Step 4: Drop and re-create with new model in metadata
    print(" Dropping existing collection...")
    client = chromadb.PersistentClient(path=palace_path)
    client.delete_collection("mempalace_drawers")

    _palace_get_collection(
        palace_path,
        model=target_model,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    print(f" Created collection with model: {target_model}")

    # Step 5: Re-mine with exact file list
    print(" Re-mining...")

    from .miner import mine
    from collections import defaultdict

    # Group files by parent directory so each directory is mined once.
    dir_files = defaultdict(list)
    for f in sorted(existing):
        dir_files[os.path.dirname(f)].append(f)

    for source_dir, file_list in sorted(dir_files.items()):
        if os.path.isdir(source_dir):
            print(f"\n Mining: {source_dir} ({len(file_list)} files)")
            mine(
                project_dir=source_dir,
                palace_path=palace_path,
                source_files=file_list,
            )

    print(f"\n{'=' * 55}")
    print(f" Re-mine complete. Model: {target_model}")
    if missing:
        print(f" Skipped {len(missing)} missing source files.")
    print(f" Backup saved at {backup_path}")
    print(f"{'=' * 55}\n")
320+
321+
169322
def cmd_repair(args):
170323
"""Rebuild palace vector index from SQLite metadata."""
171324
import chromadb
@@ -189,10 +342,8 @@ def cmd_repair(args):
189342
print(f"{'=' * 55}\n")
190343
print(f" Palace: {palace_path}")
191344

192-
# Try to read existing drawers
193345
try:
194-
client = chromadb.PersistentClient(path=palace_path)
195-
col = client.get_collection("mempalace_drawers")
346+
col = _palace_get_collection(palace_path, force=True)
196347
total = col.count()
197348
print(f" Drawers found: {total}")
198349
except Exception as e:
@@ -239,8 +390,9 @@ def cmd_repair(args):
239390
shutil.copytree(palace_path, backup_path)
240391

241392
print(" Rebuilding collection...")
393+
client = chromadb.PersistentClient(path=palace_path)
242394
client.delete_collection("mempalace_drawers")
243-
new_col = client.create_collection("mempalace_drawers")
395+
new_col = _palace_get_collection(palace_path)
244396

245397
filed = 0
246398
for i in range(0, len(all_ids), batch_size):
@@ -293,12 +445,10 @@ def cmd_mcp(args):
293445

294446
def cmd_compress(args):
295447
"""Compress drawers in a wing using AAAK Dialect."""
296-
import chromadb
297448
from .dialect import Dialect
298449

299450
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
300451

301-
# Load dialect (with optional entity config)
302452
config_path = args.config
303453
if not config_path:
304454
for candidate in ["entities.json", os.path.join(palace_path, "entities.json")]:
@@ -312,16 +462,13 @@ def cmd_compress(args):
312462
else:
313463
dialect = Dialect()
314464

315-
# Connect to palace
316465
try:
317-
client = chromadb.PersistentClient(path=palace_path)
318-
col = client.get_collection("mempalace_drawers")
466+
col = _palace_get_collection(palace_path)
319467
except Exception:
320468
print(f"\n No palace found at {palace_path}")
321469
print(" Run: mempalace init <dir> then mempalace mine <dir>")
322470
sys.exit(1)
323471

324-
# Query drawers in batches to avoid SQLite variable limit (~999)
325472
where = {"wing": args.wing} if args.wing else None
326473
_BATCH = 500
327474
docs, metas, ids = [], [], []
@@ -383,10 +530,9 @@ def cmd_compress(args):
383530
print(f" {compressed}")
384531
print()
385532

386-
# Store compressed versions (unless dry-run)
387533
if not args.dry_run:
388534
try:
389-
comp_col = client.get_or_create_collection("mempalace_compressed")
535+
comp_col = _palace_get_collection(palace_path, "mempalace_compressed")
390536
for doc_id, compressed, meta, stats in compressed_entries:
391537
comp_meta = dict(meta)
392538
comp_meta["compression_ratio"] = round(stats["size_ratio"], 1)
@@ -403,9 +549,7 @@ def cmd_compress(args):
403549
print(f" Error storing compressed drawers: {e}")
404550
sys.exit(1)
405551

406-
# Summary
407552
ratio = total_original / max(total_compressed, 1)
408-
# Estimate tokens from char count (~3.8 chars/token for English text)
409553
orig_tokens = max(1, int(total_original / 3.8))
410554
comp_tokens = max(1, int(total_compressed / 3.8))
411555
print(f" Total: {orig_tokens:,}t -> {comp_tokens:,}t ({ratio:.1f}x compression)")
@@ -428,11 +572,23 @@ def main():
428572
sub = parser.add_subparsers(dest="command")
429573

430574
# init
431-
p_init = sub.add_parser("init", help="Detect rooms from your folder structure")
575+
p_init = sub.add_parser("init", help="Detect rooms and initialize palace config")
432576
p_init.add_argument("dir", help="Project directory to set up")
433577
p_init.add_argument(
434578
"--yes", action="store_true", help="Auto-accept all detected entities (non-interactive)"
435579
)
580+
p_init.add_argument(
581+
"--model", default=None,
582+
help="Embedding model to bind to this palace (default: chromadb-default)",
583+
)
584+
p_init.add_argument(
585+
"--chunk-size", type=int, default=None,
586+
help="Chunk size in characters (default: 450)",
587+
)
588+
p_init.add_argument(
589+
"--chunk-overlap", type=int, default=None,
590+
help="Chunk overlap in characters (default: 50)",
591+
)
436592

437593
# mine
438594
p_mine = sub.add_parser("mine", help="Mine files into the palace")
@@ -575,6 +731,27 @@ def main():
575731

576732
sub.add_parser("status", help="Show what's been filed")
577733

734+
# re-mine
735+
p_remine = sub.add_parser(
736+
"re-mine",
737+
help="Re-mine palace with a new embedding model",
738+
)
739+
p_remine.add_argument(
740+
"--model", default=None,
741+
help="New embedding model (default: keep current palace model)",
742+
)
743+
p_remine.add_argument(
744+
"--chunk-size", type=int, default=None,
745+
help="New chunk size in characters",
746+
)
747+
p_remine.add_argument(
748+
"--chunk-overlap", type=int, default=None,
749+
help="New chunk overlap in characters",
750+
)
751+
p_remine.add_argument(
752+
"--dry-run", action="store_true", help="Show what would be re-mined without doing it"
753+
)
754+
578755
args = parser.parse_args()
579756

580757
if not args.command:
@@ -609,6 +786,7 @@ def main():
609786
"repair": cmd_repair,
610787
"migrate": cmd_migrate,
611788
"status": cmd_status,
789+
"re-mine": cmd_remine,
612790
}
613791
dispatch[args.command](args)
614792

0 commit comments

Comments
 (0)