Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions mempalace/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def sanitize_content(value: str, max_length: int = 100_000) -> str:

DEFAULT_PALACE_PATH = os.path.expanduser("~/.mempalace/palace")
DEFAULT_COLLECTION_NAME = "mempalace_drawers"
DEFAULT_EMBEDDING_MODEL = None # None = use ChromaDB built-in (all-MiniLM-L6-v2)

DEFAULT_TOPIC_WINGS = [
"emotions",
Expand Down Expand Up @@ -152,6 +153,21 @@ def collection_name(self):
"""ChromaDB collection name."""
return self._file_config.get("collection_name", DEFAULT_COLLECTION_NAME)

@property
def embedding_model(self):
    """SentenceTransformer model name for embeddings.

    Set to a multilingual model (e.g. "paraphrase-multilingual-MiniLM-L12-v2")
    to improve search quality for non-English content.
    None means use ChromaDB's built-in default (all-MiniLM-L6-v2, English-only).

    Can also be set via the MEMPALACE_EMBEDDING_MODEL environment variable,
    which takes precedence over the config file. An unset, empty, or
    whitespace-only variable is ignored, and surrounding whitespace is
    stripped from the value.

    Returns:
        The model name from the environment if set, otherwise the
        ``embedding_model`` entry of the file config, otherwise
        ``DEFAULT_EMBEDDING_MODEL``.
    """
    # Strip so that values like " model " or "   " (easy to produce from
    # shell scripts or .env files) do not end up as bogus model names.
    env_val = os.environ.get("MEMPALACE_EMBEDDING_MODEL", "").strip()
    if env_val:
        return env_val
    return self._file_config.get("embedding_model", DEFAULT_EMBEDDING_MODEL)

@property
def people_map(self):
"""Mapping of name variants to canonical names."""
Expand Down
35 changes: 33 additions & 2 deletions mempalace/palace.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import os
import chromadb
from mempalace.config import MempalaceConfig

SKIP_DIRS = {
".git",
Expand Down Expand Up @@ -34,6 +35,34 @@
}


def _get_embedding_function():
    """Return a ChromaDB embedding function based on config, or None for the default.

    When ``embedding_model`` is set in ``~/.mempalace/config.json`` (or via the
    ``MEMPALACE_EMBEDDING_MODEL`` env var), a ``SentenceTransformerEmbeddingFunction``
    is returned so that any HuggingFace sentence-transformers model can be used.
    This is useful for non-English content — for example::

        # ~/.mempalace/config.json
        {"embedding_model": "paraphrase-multilingual-MiniLM-L12-v2"}

    Returns ``None`` to fall back to ChromaDB's built-in ONNX model
    (``all-MiniLM-L6-v2``), which is the default behaviour and requires no
    extra dependencies.

    Raises:
        ImportError: if a model is configured but ``sentence-transformers``
            (or ChromaDB's wrapper for it) cannot be imported.
    """
    model_name = MempalaceConfig().embedding_model
    if not model_name:
        return None
    try:
        # Probe the optional dependency directly. ChromaDB's wrapper imports
        # it lazily inside its constructor and may report a missing install
        # as a non-ImportError, which would bypass this friendly message.
        import sentence_transformers  # noqa: F401
        from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
    except ImportError as err:
        raise ImportError(
            f"embedding_model is set to '{model_name}' but the 'sentence-transformers' "
            "package is not installed. Run: pip install sentence-transformers"
        ) from err
    # Constructed outside the try block so that errors while loading the
    # model itself are not mislabelled as a missing-package problem.
    return SentenceTransformerEmbeddingFunction(model_name=model_name)


def get_collection(palace_path: str, collection_name: str = "mempalace_drawers"):
"""Get or create the palace ChromaDB collection."""
os.makedirs(palace_path, exist_ok=True)
Expand All @@ -42,10 +71,12 @@ def get_collection(palace_path: str, collection_name: str = "mempalace_drawers")
except (OSError, NotImplementedError):
pass
client = chromadb.PersistentClient(path=palace_path)
ef = _get_embedding_function()
kwargs = {"embedding_function": ef} if ef is not None else {}
try:
return client.get_collection(collection_name)
return client.get_collection(collection_name, **kwargs)
except Exception:
return client.create_collection(collection_name)
return client.create_collection(collection_name, **kwargs)


def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool:
Expand Down