3333import argparse
3434from pathlib import Path
3535
36- from .config import MempalaceConfig
36+ from .config import MempalaceConfig , read_collection_metadata
37+ from .palace import get_collection as _palace_get_collection
3738
3839
3940def cmd_init (args ):
@@ -51,7 +52,6 @@ def cmd_init(args):
5152 total = len (detected ["people" ]) + len (detected ["projects" ]) + len (detected ["uncertain" ])
5253 if total > 0 :
5354 confirmed = confirm_entities (detected , yes = getattr (args , "yes" , False ))
54- # Save confirmed entities to <project>/entities.json for the miner
5555 if confirmed ["people" ] or confirmed ["projects" ]:
5656 entities_path = Path (args .dir ).expanduser ().resolve () / "entities.json"
5757 with open (entities_path , "w" ) as f :
@@ -62,7 +62,27 @@ def cmd_init(args):
6262
6363 # Pass 2: detect rooms from folder structure
6464 detect_rooms_local (project_dir = args .dir , yes = getattr (args , "yes" , False ))
65- MempalaceConfig ().init ()
65+
66+ # Global config
67+ cfg = MempalaceConfig ()
68+ cfg .init ()
69+
70+ # Create collection with embedding model bound in metadata
71+ palace_path = os .path .expanduser (args .palace ) if getattr (args , "palace" , None ) else cfg .palace_path
72+ model = getattr (args , "model" , None ) or "chromadb-default"
73+ chunk_size = getattr (args , "chunk_size" , None )
74+ chunk_overlap = getattr (args , "chunk_overlap" , None )
75+
76+ _palace_get_collection (
77+ palace_path ,
78+ model = model ,
79+ chunk_size = chunk_size ,
80+ chunk_overlap = chunk_overlap ,
81+ )
82+ print (f"\n Palace initialized: { palace_path } " )
83+ print (f" Embedding model: { model } " )
84+ if chunk_size :
85+ print (f" Chunk size: { chunk_size } " )
6686
6787
6888def cmd_mine (args ):
@@ -133,8 +153,6 @@ def cmd_split(args):
133153 from .split_mega_files import main as split_main
134154 import sys
135155
136- # Rebuild argv for split_mega_files argparse
137- # Expand ~ and resolve to absolute path so split_mega_files sees a real path
138156 argv = ["--source" , str (Path (args .dir ).expanduser ().resolve ())]
139157 if args .output_dir :
140158 argv += ["--output-dir" , args .output_dir ]
@@ -163,9 +181,144 @@ def cmd_status(args):
163181 from .miner import status
164182
165183 palace_path = os .path .expanduser (args .palace ) if args .palace else MempalaceConfig ().palace_path
184+
185+ col_meta = read_collection_metadata (palace_path )
186+ if col_meta :
187+ print (f"\n Palace config (from collection metadata):" )
188+ print (f" Embedding model: { col_meta .get ('embedding_model' , 'unknown' )} " )
189+ if "chunk_size" in col_meta :
190+ print (f" Chunk size: { col_meta ['chunk_size' ]} " )
191+ if "chunk_overlap" in col_meta :
192+ print (f" Chunk overlap: { col_meta ['chunk_overlap' ]} " )
193+
166194 status (palace_path = palace_path )
167195
168196
def _extract_source_files(palace_path: str) -> set:
    """Collect the unique ``source_file`` values stored in palace metadata.

    Args:
        palace_path: Filesystem path of the palace whose collection is read.

    Returns:
        A set of the non-empty ``source_file`` values found in the
        collection's metadata entries. An empty set is returned when the
        collection cannot be opened.
    """
    from .palace import get_collection, iter_all_metadatas

    try:
        col = get_collection(palace_path, force=True)
    except Exception:
        # Deliberate best-effort: a palace that cannot be opened is reported
        # to the caller as "no sources" instead of aborting the command.
        return set()

    # Skip metadata entries that are missing/empty as well as entries that
    # carry no usable source_file value.
    return {
        meta["source_file"]
        for meta in iter_all_metadatas(col)
        if meta and meta.get("source_file")
    }
212+
213+
def cmd_remine(args):
    """Re-mine palace with a new or current embedding model.

    Reads the source file paths recorded in the existing drawers, backs the
    palace directory up, drops the ``mempalace_drawers`` collection,
    re-creates it with the target settings and mines every source file that
    still exists on disk.

    Args:
        args: Parsed CLI namespace. Reads ``palace`` and ``dry_run``, plus the
            optional ``model``, ``chunk_size`` and ``chunk_overlap``
            attributes. Chunk settings that are not supplied fall back to the
            values stored in the current collection metadata, if any.

    Returns:
        None. Progress is reported on stdout. Returns early (without touching
        the palace) when the palace directory is missing, when no source
        files are recorded or left on disk, or when ``dry_run`` is set.
    """
    palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path

    if not os.path.isdir(palace_path):
        print(f"\n No palace found at {palace_path} ")
        return

    # Determine target model. The metadata lookup may yield a falsy value
    # (cmd_status guards for that too), so normalise it to a dict first.
    col_meta = read_collection_metadata(palace_path) or {}
    new_model = getattr(args, "model", None)
    if new_model:
        target_model = new_model
    else:
        target_model = col_meta.get("embedding_model", "chromadb-default")

    print(f"\n {'=' * 55} ")
    print(" MemPalace Re-mine")
    print(f"{'=' * 55} \n ")
    print(f" Palace: {palace_path} ")
    print(f" Target model: {target_model} ")
    if new_model and col_meta.get("embedding_model") and col_meta["embedding_model"] != new_model:
        print(f" Previous model: {col_meta['embedding_model']} ")

    # Step 1: Extract source files
    print("\n Extracting source file paths from existing drawers...")
    sources = _extract_source_files(palace_path)

    if not sources:
        print(" No drawers found. Nothing to re-mine.")
        return

    # Step 2: Partition into existing vs missing
    existing = {s for s in sources if os.path.isfile(s)}
    missing = sources - existing

    print(f" Found {len(sources)} unique source files.")
    print(f" Still exist: {len(existing)} ")
    print(f" Missing: {len(missing)} ")

    if missing:
        print("\n Missing files (will be skipped):")
        for f in sorted(missing)[:20]:
            print(f" - {f} ")
        if len(missing) > 20:
            print(f" ... and {len(missing) - 20} more")

    if not existing:
        print("\n No source files found on disk. Nothing to re-mine.")
        return

    if args.dry_run:
        print(f"\n (dry run — would re-mine {len(existing)} files)")
        return

    # Step 3: Backup palace before destructive operation
    import shutil
    import chromadb

    backup_path = palace_path.rstrip(os.sep) + ".pre-remine-backup"
    if os.path.exists(backup_path):
        # Only one backup is kept; a previous one is replaced.
        shutil.rmtree(backup_path)
    print(f"\n Backing up to {backup_path} ...")
    shutil.copytree(palace_path, backup_path)

    # Resolve chunk settings before the collection is dropped: explicit CLI
    # values win, otherwise keep what the current collection metadata records
    # so that a re-mine without chunk flags does not lose those settings.
    chunk_size = getattr(args, "chunk_size", None)
    if chunk_size is None:
        chunk_size = col_meta.get("chunk_size")
    chunk_overlap = getattr(args, "chunk_overlap", None)
    if chunk_overlap is None:
        chunk_overlap = col_meta.get("chunk_overlap")

    # Step 4: Drop and re-create with new model in metadata
    print(" Dropping existing collection...")
    client = chromadb.PersistentClient(path=palace_path)
    client.delete_collection("mempalace_drawers")

    _palace_get_collection(
        palace_path,
        model=target_model,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    print(f" Created collection with model: {target_model} ")

    # Step 5: Re-mine with exact file list
    print(" Re-mining...")

    from .miner import mine
    from collections import defaultdict

    # Group files by containing directory; mine() is invoked once per
    # directory with the explicit list of files to process.
    dir_files = defaultdict(list)
    for f in sorted(existing):
        # A bare file name has an empty dirname; map it to the current
        # directory so the file is not silently dropped by the isdir check.
        dir_files[os.path.dirname(f) or os.curdir].append(f)

    for source_dir, file_list in sorted(dir_files.items()):
        if os.path.isdir(source_dir):
            print(f"\n Mining: {source_dir} ({len(file_list)} files)")
            mine(
                project_dir=source_dir,
                palace_path=palace_path,
                source_files=file_list,
            )

    print(f"\n {'=' * 55} ")
    print(f" Re-mine complete. Model: {target_model} ")
    if missing:
        print(f" Skipped {len(missing)} missing source files.")
    print(f" Backup saved at {backup_path} ")
    print(f"{'=' * 55} \n ")
320+
321+
169322def cmd_repair (args ):
170323 """Rebuild palace vector index from SQLite metadata."""
171324 import chromadb
@@ -189,10 +342,8 @@ def cmd_repair(args):
189342 print (f"{ '=' * 55 } \n " )
190343 print (f" Palace: { palace_path } " )
191344
192- # Try to read existing drawers
193345 try :
194- client = chromadb .PersistentClient (path = palace_path )
195- col = client .get_collection ("mempalace_drawers" )
346+ col = _palace_get_collection (palace_path , force = True )
196347 total = col .count ()
197348 print (f" Drawers found: { total } " )
198349 except Exception as e :
@@ -239,8 +390,9 @@ def cmd_repair(args):
239390 shutil .copytree (palace_path , backup_path )
240391
241392 print (" Rebuilding collection..." )
393+ client = chromadb .PersistentClient (path = palace_path )
242394 client .delete_collection ("mempalace_drawers" )
243- new_col = client . create_collection ( "mempalace_drawers" )
395+ new_col = _palace_get_collection ( palace_path )
244396
245397 filed = 0
246398 for i in range (0 , len (all_ids ), batch_size ):
@@ -293,12 +445,10 @@ def cmd_mcp(args):
293445
294446def cmd_compress (args ):
295447 """Compress drawers in a wing using AAAK Dialect."""
296- import chromadb
297448 from .dialect import Dialect
298449
299450 palace_path = os .path .expanduser (args .palace ) if args .palace else MempalaceConfig ().palace_path
300451
301- # Load dialect (with optional entity config)
302452 config_path = args .config
303453 if not config_path :
304454 for candidate in ["entities.json" , os .path .join (palace_path , "entities.json" )]:
@@ -312,16 +462,13 @@ def cmd_compress(args):
312462 else :
313463 dialect = Dialect ()
314464
315- # Connect to palace
316465 try :
317- client = chromadb .PersistentClient (path = palace_path )
318- col = client .get_collection ("mempalace_drawers" )
466+ col = _palace_get_collection (palace_path )
319467 except Exception :
320468 print (f"\n No palace found at { palace_path } " )
321469 print (" Run: mempalace init <dir> then mempalace mine <dir>" )
322470 sys .exit (1 )
323471
324- # Query drawers in batches to avoid SQLite variable limit (~999)
325472 where = {"wing" : args .wing } if args .wing else None
326473 _BATCH = 500
327474 docs , metas , ids = [], [], []
@@ -383,10 +530,9 @@ def cmd_compress(args):
383530 print (f" { compressed } " )
384531 print ()
385532
386- # Store compressed versions (unless dry-run)
387533 if not args .dry_run :
388534 try :
389- comp_col = client . get_or_create_collection ( "mempalace_compressed" )
535+ comp_col = _palace_get_collection ( palace_path , "mempalace_compressed" )
390536 for doc_id , compressed , meta , stats in compressed_entries :
391537 comp_meta = dict (meta )
392538 comp_meta ["compression_ratio" ] = round (stats ["size_ratio" ], 1 )
@@ -403,9 +549,7 @@ def cmd_compress(args):
403549 print (f" Error storing compressed drawers: { e } " )
404550 sys .exit (1 )
405551
406- # Summary
407552 ratio = total_original / max (total_compressed , 1 )
408- # Estimate tokens from char count (~3.8 chars/token for English text)
409553 orig_tokens = max (1 , int (total_original / 3.8 ))
410554 comp_tokens = max (1 , int (total_compressed / 3.8 ))
411555 print (f" Total: { orig_tokens :,} t -> { comp_tokens :,} t ({ ratio :.1f} x compression)" )
@@ -428,11 +572,23 @@ def main():
428572 sub = parser .add_subparsers (dest = "command" )
429573
430574 # init
431- p_init = sub .add_parser ("init" , help = "Detect rooms from your folder structure " )
575+ p_init = sub .add_parser ("init" , help = "Detect rooms and initialize palace config " )
432576 p_init .add_argument ("dir" , help = "Project directory to set up" )
433577 p_init .add_argument (
434578 "--yes" , action = "store_true" , help = "Auto-accept all detected entities (non-interactive)"
435579 )
580+ p_init .add_argument (
581+ "--model" , default = None ,
582+ help = "Embedding model to bind to this palace (default: chromadb-default)" ,
583+ )
584+ p_init .add_argument (
585+ "--chunk-size" , type = int , default = None ,
586+ help = "Chunk size in characters (default: 450)" ,
587+ )
588+ p_init .add_argument (
589+ "--chunk-overlap" , type = int , default = None ,
590+ help = "Chunk overlap in characters (default: 50)" ,
591+ )
436592
437593 # mine
438594 p_mine = sub .add_parser ("mine" , help = "Mine files into the palace" )
@@ -575,6 +731,27 @@ def main():
575731
576732 sub .add_parser ("status" , help = "Show what's been filed" )
577733
734+ # re-mine
735+ p_remine = sub .add_parser (
736+ "re-mine" ,
737+ help = "Re-mine palace with a new embedding model" ,
738+ )
739+ p_remine .add_argument (
740+ "--model" , default = None ,
741+ help = "New embedding model (default: keep current palace model)" ,
742+ )
743+ p_remine .add_argument (
744+ "--chunk-size" , type = int , default = None ,
745+ help = "New chunk size in characters" ,
746+ )
747+ p_remine .add_argument (
748+ "--chunk-overlap" , type = int , default = None ,
749+ help = "New chunk overlap in characters" ,
750+ )
751+ p_remine .add_argument (
752+ "--dry-run" , action = "store_true" , help = "Show what would be re-mined without doing it"
753+ )
754+
578755 args = parser .parse_args ()
579756
580757 if not args .command :
@@ -609,6 +786,7 @@ def main():
609786 "repair" : cmd_repair ,
610787 "migrate" : cmd_migrate ,
611788 "status" : cmd_status ,
789+ "re-mine" : cmd_remine ,
612790 }
613791 dispatch [args .command ](args )
614792
0 commit comments