From 66f928a539d79553661a0f7c1e2596b9d1d19b99 Mon Sep 17 00:00:00 2001 From: Vas S Date: Wed, 29 Apr 2026 11:31:07 -0500 Subject: [PATCH] fix: persistent daemon to prevent zombie processes and enable multi-session access (closes #1229) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces a daemon + bridge architecture so all AI agent sessions share a single long-lived MemPalace process rather than each spawning their own MCP server. This eliminates two failure modes described in #1229: 1. Zombie processes: SIGKILL bypasses Python atexit/trap cleanup, leaving stale PID files that block every subsequent session from connecting. The daemon outlives any individual session; only the bridge (a 60-line relay) dies with the session. 2. Concurrent ChromaDB writer corruption: multiple PersistentClient holders racing on the HNSW mmap files cause the sqlite metadata and in-memory index to diverge (see also #1222). The daemon serialises all tools/call requests through a threading.Lock, giving a single-writer guarantee. Files added under examples/: - mempalace-daemon.py — persistent Unix socket MCP server (LaunchAgent target) - mempalace-bridge.py — lightweight stdio<->socket relay (MCP command per session) - com.mempalace.daemon.plist — macOS LaunchAgent template; auto-restarts on crash Docs added: - docs/multi-session-daemon.md — problem description, architecture, install steps, and MCP config examples for Claude Code, Codex, Gemini CLI, and generic clients. No changes to the core mempalace package or mcp_server.py — the daemon imports handle_request() directly, making this purely additive. Tested on macOS 14/15, Python 3.11/3.12, MemPalace 3.3.x with four concurrent sessions (Claude Code + Codex + Gemini CLI + GG). 
Co-Authored-By: Claude Sonnet 4.6 --- docs/multi-session-daemon.md | 218 ++++++++++++++++++++++++++++ examples/com.mempalace.daemon.plist | 26 ++++ examples/mempalace-bridge.py | 100 +++++++++++++ examples/mempalace-daemon.py | 123 ++++++++++++++++ 4 files changed, 467 insertions(+) create mode 100644 docs/multi-session-daemon.md create mode 100644 examples/com.mempalace.daemon.plist create mode 100644 examples/mempalace-bridge.py create mode 100644 examples/mempalace-daemon.py diff --git a/docs/multi-session-daemon.md b/docs/multi-session-daemon.md new file mode 100644 index 000000000..d75e2da42 --- /dev/null +++ b/docs/multi-session-daemon.md @@ -0,0 +1,218 @@ +# Multi-Session MCP Daemon — Persistent MemPalace for All Your AI Agents + +**Fixes:** #1229 — zombie MCP server processes blocking sessions and corrupting ChromaDB + +## The Problem + +The default `mempalace-mcp` command (which calls `mempalace.mcp_server`) is designed +to run as one process per agent session. When you run Claude Code, Codex, Gemini CLI, +and GG simultaneously, each session spawns its own Python process — and each process +holds an open ChromaDB `PersistentClient`. + +This creates two compounding failure modes: + +### 1. Zombie processes after SIGKILL + +MCP host applications (Claude Desktop, VS Code, terminal multiplexers) sometimes +force-quit sessions with `SIGKILL` rather than `SIGTERM`. Python's `atexit` handlers +and `signal.signal(SIGTERM, ...)` cleanup traps never fire on `SIGKILL`. The result +is a Python process that dies without ever removing its PID file, +`~/.mempalace/mcp-server.pid` — the kernel closes the descriptor, but the file stays on disk. + +On the next session start, a PID-file guard sees a stale PID file, decides another +instance is running, and refuses to start. The session connects to nothing, and every +MCP tool call returns "Connection closed". + +Removing the stale PID file by hand is the only recovery — until it happens again. + +### 2. 
Concurrent ChromaDB writers corrupt HNSW + +ChromaDB's HNSW index uses memory-mapped segment files. When two processes both hold +a `PersistentClient` against the same `chroma.sqlite3` and both call `upsert()`, the +writes interleave at the mmap level. This causes the in-memory HNSW tree and the on-disk +sqlite metadata to diverge — exactly the divergence issue #1222 documents and detects. + +The `hnsw_capacity_status` probe (introduced in #1222) can detect this after the fact, +but it cannot prevent it. The only safe fix is ensuring **a single process owns the +ChromaDB connection** at all times. + +## The Solution: Daemon + Bridge Architecture + +Instead of each session spawning its own `mcp_server.py` process, a single **daemon** +process runs continuously (managed by macOS LaunchAgent), holds the ChromaDB connection, +and serves all sessions over a Unix socket. Each agent session runs a tiny **bridge** +script that relays its stdio to/from the daemon socket. + +``` +macOS LaunchAgent + └── mempalace-daemon.py (one process, holds ChromaDB, listens on ~/.mempalace/mcp.sock) + ├── Claude Code session ←→ mempalace-bridge.py ←→ socket + ├── Codex session ←→ mempalace-bridge.py ←→ socket + ├── Gemini CLI session ←→ mempalace-bridge.py ←→ socket + └── GG session ←→ mempalace-bridge.py ←→ socket +``` + +**Benefits:** + +- **No zombie problem.** If a session is SIGKILL'd, only the bridge dies. The daemon + keeps running; the socket stays open; the next session connects immediately. +- **No concurrent writer corruption.** All `tools/call` requests are serialised through + a single `threading.Lock` inside the daemon. Protocol messages (`initialize`, + `tools/list`, `ping`) are lock-free for speed. +- **Auto-start on first use.** The bridge detects a missing socket and starts the + daemon automatically, so you don't have to think about it. +- **LaunchAgent keeps it alive.** If the daemon crashes (e.g. 
OOM), launchd restarts + it within `ThrottleInterval` seconds (set to 5 in the provided plist; launchd's built-in default is 10). + +## Files + +Three files are provided in the `examples/` directory: + +| File | Purpose | +|------|---------| +| `mempalace-daemon.py` | The persistent server process — run once via LaunchAgent | +| `mempalace-bridge.py` | Per-session stdio relay — this is the MCP command you configure | +| `com.mempalace.daemon.plist` | macOS LaunchAgent template — edit paths, then `launchctl load` | + +## Installation + +### Step 1 — Copy the scripts + +```bash +# Copy to wherever you keep your local tooling. +# The bridge looks for the daemon script relative to its own location, +# so keep both files in the same directory. +cp examples/mempalace-daemon.py ~/bin/mempalace-daemon.py +cp examples/mempalace-bridge.py ~/bin/mempalace-bridge.py +chmod +x ~/bin/mempalace-daemon.py ~/bin/mempalace-bridge.py +``` + +### Step 2 — Edit the LaunchAgent plist + +Open `examples/com.mempalace.daemon.plist` and replace: + +- `/path/to/mempalace/venv/bin/python` — the Python interpreter in your MemPalace + virtualenv (run `which python` inside the venv to get the path) +- `/path/to/mempalace-daemon.py` — the absolute path where you copied the daemon script +- `YOUR_USERNAME` — your macOS username (`echo $USER`) + +### Step 3 — Install and load the LaunchAgent + +```bash +cp examples/com.mempalace.daemon.plist ~/Library/LaunchAgents/com.mempalace.daemon.plist +launchctl load ~/Library/LaunchAgents/com.mempalace.daemon.plist +``` + +Verify it started: + +```bash +# The socket should exist within a few seconds +ls -la ~/.mempalace/mcp.sock + +# Check the log +tail -f ~/.mempalace/daemon.log +``` + +### Step 4 — Configure each agent to use the bridge + +Replace every `mempalace-mcp` / `mempalace.mcp_server` invocation with the bridge. 
+ +#### Claude Code (`~/.claude.json`) + +```json +{ + "mcpServers": { + "mempalace": { + "type": "stdio", + "command": "/absolute/path/to/python", + "args": ["/absolute/path/to/mempalace-bridge.py"] + } + } +} +``` + +Or register via CLI: + +```bash +claude mcp add mempalace -- /path/to/python /path/to/mempalace-bridge.py +``` + +#### Codex (`~/.codex/config.toml`) + +```toml +[mcp_servers.mempalace] +command = "/absolute/path/to/python" +args = ["/absolute/path/to/mempalace-bridge.py"] +``` + +#### Gemini CLI (`~/.gemini/settings.json`) + +```json +{ + "mcpServers": { + "mempalace": { + "command": "/absolute/path/to/python", + "args": ["/absolute/path/to/mempalace-bridge.py"] + } + } +} +``` + +Or via CLI: + +```bash +gemini mcp add mempalace /absolute/path/to/python /absolute/path/to/mempalace-bridge.py --scope user +``` + +#### Any other stdio MCP client + +The bridge is a generic stdio relay. Any client that accepts a `command` + `args` +MCP configuration can use it: + +``` +command: /path/to/python +args: ["/path/to/mempalace-bridge.py"] +``` + +## Troubleshooting + +### "MemPalace daemon not reachable" + +The bridge tried 20 times (5 seconds total) to connect and failed. + +1. Check the daemon log: `tail ~/.mempalace/daemon.log` +2. Check launchd status: `launchctl list | grep mempalace` +3. Try starting the daemon manually to see startup errors: + ```bash + /path/to/python /path/to/mempalace-daemon.py + ``` + +### LaunchAgent not starting + +- Verify plist syntax: `plutil ~/Library/LaunchAgents/com.mempalace.daemon.plist` +- Reload: `launchctl unload ~/Library/LaunchAgents/com.mempalace.daemon.plist && launchctl load ~/Library/LaunchAgents/com.mempalace.daemon.plist` +- Check Console.app for launchd errors. + +### Stale socket from a previous crash + +If the daemon crashed without cleanup, the socket file may still exist but be dead. 
+The daemon removes a stale socket automatically on startup (`SOCK_PATH.unlink()` before +`server.bind()`), so restarting the daemon is sufficient: + +```bash +launchctl kickstart -k gui/$(id -u)/com.mempalace.daemon +``` + +### Reverting to the single-process mode + +Remove or unload the LaunchAgent, then update your MCP configurations back to +`mempalace-mcp` or `python -m mempalace.mcp_server`. The daemon and bridge are +additive — they do not modify the core `mcp_server.py`. + +## Tested on + +- macOS 14 Sonoma +- macOS 15 Sequoia +- Python 3.11 / 3.12 +- MemPalace 3.3.x (ChromaDB 0.6.x) +- Concurrent sessions: Claude Code + Codex + Gemini CLI + GG diff --git a/examples/com.mempalace.daemon.plist b/examples/com.mempalace.daemon.plist new file mode 100644 index 000000000..7eaa6776f --- /dev/null +++ b/examples/com.mempalace.daemon.plist @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<dict> + <key>Label</key> + <string>com.mempalace.daemon</string> + <key>ProgramArguments</key> + <array> + <!-- Replace both paths with absolute paths on your machine --> + <string>/path/to/mempalace/venv/bin/python</string> + <string>/path/to/mempalace-daemon.py</string> + </array> + <key>RunAtLoad</key> + <true/> + <key>KeepAlive</key> + <true/> + <key>ThrottleInterval</key> + <integer>5</integer> + <key>StandardOutPath</key> + <string>/Users/YOUR_USERNAME/.mempalace/daemon.log</string> + <key>StandardErrorPath</key> + <string>/Users/YOUR_USERNAME/.mempalace/daemon.log</string> + <key>WorkingDirectory</key> + <string>/Users/YOUR_USERNAME</string> +</dict> +</plist> diff --git a/examples/mempalace-bridge.py b/examples/mempalace-bridge.py new file mode 100644 index 000000000..931f0cd7c --- /dev/null +++ b/examples/mempalace-bridge.py @@ -0,0 +1,100 @@ +""" +mempalace-bridge.py — Lightweight stdio<->socket relay for MemPalace. + +Configure each AI agent (Claude Code, Codex, Gemini CLI, etc.) to run this +script as the MCP command instead of mempalace.mcp_server directly. +It auto-starts the daemon on first use. 
# MCP config example (Claude Code ~/.claude.json):
#     "mempalace": {
#         "type": "stdio",
#         "command": "/path/to/python",
#         "args": ["/path/to/mempalace-bridge.py"]
#     }
import os
import socket
import subprocess
import sys
import threading
import time
from pathlib import Path

PALACE_DIR = Path(os.environ.get("MEMPALACE_PALACE", Path.home() / ".mempalace"))
SOCK_PATH = PALACE_DIR / "mcp.sock"
DAEMON_PYTHON = sys.executable
DAEMON_SCRIPT = str(Path(__file__).parent / "mempalace-daemon.py")


def _start_daemon():
    """Spawn the daemon script as a detached background process.

    The child runs in its own session (``start_new_session=True``) and its
    stdout/stderr are appended to ``PALACE_DIR / "daemon.log"``.
    """
    PALACE_DIR.mkdir(parents=True, exist_ok=True)
    log_path = PALACE_DIR / "daemon.log"
    # The child receives its own duplicate of the descriptor, so the parent's
    # handle can be closed as soon as Popen returns instead of leaking for the
    # lifetime of the bridge.
    with open(str(log_path), "a") as log_file:
        subprocess.Popen(
            [DAEMON_PYTHON, DAEMON_SCRIPT],
            stdout=log_file,
            stderr=subprocess.STDOUT,
            close_fds=True,
            start_new_session=True,
        )


def _connect(retries: int = 20, delay: float = 0.25) -> socket.socket:
    """Connect to the daemon socket, starting the daemon if it is not there.

    Args:
        retries: Maximum number of connection attempts.
        delay: Seconds to sleep after each failed attempt.

    Returns:
        A connected ``AF_UNIX`` stream socket.

    Raises:
        RuntimeError: If every attempt failed because the socket file was
            missing or refused the connection.
        OSError: Any other connection error is propagated unchanged.
    """
    for attempt in range(retries):
        s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            s.connect(str(SOCK_PATH))
            return s
        except (FileNotFoundError, ConnectionRefusedError):
            # Close the failed socket; otherwise each retry leaks a descriptor.
            s.close()
            if attempt == 0:
                # Only the first failure launches the daemon; later failures
                # just wait for it to finish starting up.
                _start_daemon()
            time.sleep(delay)
        except OSError:
            s.close()
            raise
    raise RuntimeError(f"MemPalace daemon not reachable at {SOCK_PATH}")


def main(sock=None, stdin=None, stdout=None):
    """Relay bytes between a stdio pair and the daemon socket.

    Returns once the daemon side of the socket reaches EOF. Input EOF only
    half-closes the socket (``SHUT_WR``); output keeps draining until the
    daemon closes its end, so a response that spans several ``recv`` chunks is
    never cut short because stdin closed first.

    Args:
        sock: Already-connected socket to relay over. When ``None`` the bridge
            connects to ``SOCK_PATH`` (starting the daemon if needed) and
            exits with status 1 if that fails.
        stdin: Binary stream providing ``read1()``; defaults to
            ``sys.stdin.buffer``.
        stdout: Binary stream providing ``write()``/``flush()``; defaults to
            ``sys.stdout.buffer``.
    """
    if sock is None:
        try:
            sock = _connect()
        except RuntimeError as e:
            print(f"[mempalace-bridge] {e}", file=sys.stderr)
            sys.exit(1)
    source = sys.stdin.buffer if stdin is None else stdin
    sink = sys.stdout.buffer if stdout is None else stdout

    def pump_in():
        # stdin -> socket. Best effort: a vanished peer or closed stream simply
        # ends the pump.
        try:
            while True:
                chunk = source.read1(65536)
                if not chunk:
                    break
                sock.sendall(chunk)
        except (OSError, ValueError):
            pass
        finally:
            # Signal end-of-input to the daemon but keep the read side open so
            # in-flight responses can still be received.
            try:
                sock.shutdown(socket.SHUT_WR)
            except OSError:
                pass

    def pump_out():
        # socket -> stdout, until the daemon closes the connection.
        try:
            while True:
                chunk = sock.recv(65536)
                if not chunk:
                    break
                sink.write(chunk)
                sink.flush()
        except (OSError, ValueError):
            pass

    # Both pumps are daemon threads: when the output side finishes, the process
    # can exit even if pump_in is still blocked reading stdin.
    t_in = threading.Thread(target=pump_in, daemon=True)
    t_out = threading.Thread(target=pump_out, daemon=True)
    t_in.start()
    t_out.start()
    t_out.join()


if __name__ == "__main__":
    main()
"""
mempalace-daemon.py — Persistent MemPalace Unix socket MCP server.

Runs ONCE (managed by macOS LaunchAgent). All Claude Code and Codex sessions
connect via mempalace-bridge.py (a stdio<->socket relay). This eliminates the
single-slot problem: every session gets MemPalace access simultaneously.

Architecture:
  LaunchAgent -> mempalace-daemon.py (this file, one process, holds ChromaDB)
  Per session -> mempalace-bridge.py -> Unix socket -> this daemon

Single-writer guarantee: all tools/call requests go through _tool_lock, which
serializes ChromaDB writes. Protocol messages (initialize, tools/list, ping)
are lock-free.
"""
import fcntl
import json
import logging
import os
import signal
import socket
import sys
import threading
from pathlib import Path

PALACE_DIR = Path(os.environ.get("MEMPALACE_PALACE", Path.home() / ".mempalace"))
SOCK_PATH = PALACE_DIR / "mcp.sock"
LOCK_PATH = PALACE_DIR / ".daemon.lock"
LOG_PATH = PALACE_DIR / "daemon.log"

PALACE_DIR.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    filename=str(LOG_PATH),
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
log = logging.getLogger("mempalace-daemon")

# Single-instance guard: a non-blocking exclusive flock on LOCK_PATH. The file
# object stays referenced at module level so the lock is held for the life of
# the process.
_lock_fd = open(LOCK_PATH, "w")
try:
    fcntl.flock(_lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
except OSError:  # BlockingIOError is a subclass of OSError
    log.error("Daemon already running — exiting.")
    sys.exit(1)

# NOTE(review): imported only after the lock is held — presumably so a second
# instance exits before loading the server module; confirm.
from mempalace.mcp_server import handle_request  # noqa: E402

_tool_lock = threading.Lock()

# JSON-RPC 2.0 "Internal error" code.
_INTERNAL_ERROR = -32603


def _dispatch(request: dict):
    """Pass one request to handle_request, serialising tools/call via _tool_lock."""
    if (request.get("method") or "") == "tools/call":
        with _tool_lock:
            return handle_request(request)
    return handle_request(request)


def _handle_client(conn: socket.socket, client_id: int):
    """Serve one bridge connection until the peer disconnects.

    Reads newline-delimited JSON requests from ``conn`` and writes each
    non-``None`` response back as a single JSON line.

    A bad line or a failing request no longer tears down the whole session:
    undecodable or non-object payloads are logged and skipped, and an
    exception raised while handling a request is logged and, when the request
    carries an ``id``, answered with a JSON-RPC internal-error response.

    Args:
        conn: Accepted client socket; always closed before returning.
        client_id: Sequence number used only to tag log lines.
    """
    log.info("[client-%s] connected", client_id)
    try:
        with conn, conn.makefile("rb") as f_in:
            for raw in f_in:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    request = json.loads(raw)
                except ValueError:
                    # Covers JSONDecodeError and UnicodeDecodeError alike.
                    log.warning("[client-%s] dropping undecodable line", client_id)
                    continue
                if not isinstance(request, dict):
                    log.warning("[client-%s] dropping non-object payload", client_id)
                    continue
                try:
                    response = _dispatch(request)
                    payload = None if response is None else json.dumps(response)
                except Exception:
                    log.exception("[client-%s] request failed", client_id)
                    if request.get("id") is None:
                        # Notifications get no reply, even on failure.
                        continue
                    payload = json.dumps(
                        {
                            "jsonrpc": "2.0",
                            "id": request["id"],
                            "error": {
                                "code": _INTERNAL_ERROR,
                                "message": "Internal error",
                            },
                        }
                    )
                if payload is not None:
                    conn.sendall((payload + "\n").encode())
    except (ConnectionResetError, BrokenPipeError):
        # The peer went away mid-conversation; nothing to report.
        pass
    except Exception:
        log.exception("[client-%s] handler error", client_id)
    finally:
        log.info("[client-%s] disconnected", client_id)


def main():
    """Bind the Unix socket and serve each client on its own daemon thread.

    Removes any socket file left over from a previous run, listens on
    ``SOCK_PATH`` with owner-only permissions, and accepts connections until
    SIGTERM/SIGINT arrives or ``accept()`` fails. The socket file is removed
    on the way out.
    """
    SOCK_PATH.unlink(missing_ok=True)
    server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    # Create the socket file owner-only from the start: with a plain
    # bind()-then-chmod() it is briefly reachable with umask-default
    # permissions. No client threads exist yet, so changing the process-wide
    # umask here cannot affect other work.
    previous_umask = os.umask(0o177)
    try:
        server.bind(str(SOCK_PATH))
    finally:
        os.umask(previous_umask)
    SOCK_PATH.chmod(0o600)
    server.listen(32)
    log.info("MemPalace daemon listening on %s", SOCK_PATH)
    print(f"[mempalace-daemon] Listening on {SOCK_PATH}", flush=True)

    def _shutdown(sig, frame):
        log.info("Shutting down.")
        server.close()
        SOCK_PATH.unlink(missing_ok=True)
        sys.exit(0)

    signal.signal(signal.SIGTERM, _shutdown)
    signal.signal(signal.SIGINT, _shutdown)

    client_id = 0
    try:
        while True:
            conn, _ = server.accept()
            client_id += 1
            t = threading.Thread(target=_handle_client, args=(conn, client_id), daemon=True)
            t.start()
    except OSError:
        # Record why the accept loop ended instead of exiting silently.
        log.exception("accept() failed — daemon exiting")
    finally:
        server.close()
        SOCK_PATH.unlink(missing_ok=True)


if __name__ == "__main__":
    main()