Merge pull request #1234 from MemPalace/feat/normalize-gemini-cli

igorls · web-flow · commit fdfaf017abd5 · 2026-04-27T20:42:06.000-03:00
feat(normalize): Gemini CLI session JSONL adapter
diff --git a/mempalace/normalize.py b/mempalace/normalize.py
@@ -8,6 +8,7 @@
     - ChatGPT conversations.json
     - Claude Code JSONL (with tool_use/tool_result block capture)
     - OpenAI Codex CLI JSONL
+    - Gemini CLI JSONL (~/.gemini/tmp/<project_hash>/chats/session-*.jsonl)
     - Slack JSON export
     - Plain text (pass through for paragraph chunking)
 
@@ -157,6 +158,10 @@ def _try_normalize_json(content: str) -> Optional[str]:
     if normalized:
         return normalized
 
+    normalized = _try_gemini_jsonl(content)
+    if normalized:
+        return normalized
+
     try:
         data = json.loads(content)
     except json.JSONDecodeError:
@@ -280,6 +285,74 @@ def _try_codex_jsonl(content: str) -> Optional[str]:
     return None
 
 
+def _try_gemini_jsonl(content: str) -> Optional[str]:
+    """Gemini CLI sessions (~/.gemini/tmp/<project_hash>/chats/session-*.jsonl).
+
+    Schema (per google-gemini/gemini-cli#15292): a session_metadata record
+    on the first line, then a stream of ``{"type": "user", "content":
+    [{"text": "..."}]}`` and ``{"type": "gemini", "content": [...]}``
+    records, with optional ``message_update`` records carrying token
+    counts only.
+
+    Detection requires a ``session_metadata`` record so this parser does
+    not false-positive against Claude Code or Codex JSONL passed through
+    the dispatch chain; ``user``/``gemini`` lines seen before it are
+    discarded as preamble noise. ``message_update`` and other record types
+    are skipped — they carry no message text. Text blocks within one
+    message are joined in order with newlines; blank or non-text blocks
+    are dropped. Returns ``None`` unless at least two turns are recovered
+    after ``session_metadata``.
+    """
+    lines = [line.strip() for line in content.strip().split("\n") if line.strip()]
+    messages = []
+    has_session_metadata = False
+    for line in lines:
+        try:
+            entry = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if not isinstance(entry, dict):
+            continue
+
+        entry_type = entry.get("type", "")
+        if entry_type == "session_metadata":
+            has_session_metadata = True
+            continue
+
+        # Discard everything (including user/gemini turns) until the
+        # session_metadata sentinel has been seen.
+        if not has_session_metadata:
+            continue
+
+        if entry_type not in ("user", "gemini"):
+            # Skips message_update, system events, anything else.
+            continue
+
+        content_blocks = entry.get("content", [])
+        if not isinstance(content_blocks, list):
+            continue
+
+        parts = []
+        for block in content_blocks:
+            if not isinstance(block, dict):
+                continue
+            text = block.get("text", "")
+            if isinstance(text, str) and text.strip():
+                parts.append(text)
+        if not parts:
+            continue
+        joined = "\n".join(parts)
+
+        if entry_type == "user":
+            messages.append(("user", joined))
+        else:  # "gemini"
+            messages.append(("assistant", joined))
+
+    if len(messages) >= 2 and has_session_metadata:
+        return _messages_to_transcript(messages)
+    return None
+
+
 def _try_claude_ai_json(data) -> Optional[str]:
     """Claude.ai JSON export: flat messages list or privacy export with chat_messages."""
     if isinstance(data, dict):
diff --git a/tests/test_normalize.py b/tests/test_normalize.py
@@ -11,6 +11,7 @@
     _try_claude_ai_json,
     _try_claude_code_jsonl,
     _try_codex_jsonl,
+    _try_gemini_jsonl,
     _try_normalize_json,
     _try_slack_json,
     normalize,
@@ -450,6 +451,168 @@ def test_codex_jsonl_payload_not_dict():
     assert result is not None
 
 
+# ── _try_gemini_jsonl ──────────────────────────────────────────────────
+#
+# Gemini CLI sessions live at ``~/.gemini/tmp/<project_hash>/chats/`` as
+# JSONL. The schema (per google-gemini/gemini-cli#15292):
+#
+#   {"type":"session_metadata","sessionId":"...","projectHash":"...",...}
+#   {"type":"user","id":"msg1","content":[{"text":"Hello"}]}
+#   {"type":"gemini","id":"msg2","content":[{"text":"Hi"}]}
+#   {"type":"message_update","id":"msg2","tokens":{"input":10,"output":5}}
+#
+# Detection requires a ``session_metadata`` record so this parser does
+# not false-positive against Claude Code or Codex JSONL; turns seen before
+# it are discarded. ``message_update`` entries (token counts only) are
+# skipped — they carry no message text. ``content`` is an array of
+# ``{"text": "..."}`` blocks, joined per message, in order, with newlines.
+
+
+def test_gemini_jsonl_valid():
+    lines = [
+        json.dumps({"type": "session_metadata", "sessionId": "abc", "projectHash": "h"}),
+        json.dumps({"type": "user", "id": "m1", "content": [{"text": "Hello"}]}),
+        json.dumps({"type": "gemini", "id": "m2", "content": [{"text": "Hi there"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "> Hello" in result
+    assert "Hi there" in result
+
+
+def test_gemini_jsonl_multi_turn():
+    lines = [
+        json.dumps({"type": "session_metadata", "sessionId": "s"}),
+        json.dumps({"type": "user", "content": [{"text": "Q1"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "A1"}]}),
+        json.dumps({"type": "user", "content": [{"text": "Q2"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "A2"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "> Q1" in result
+    assert "A1" in result
+    assert "> Q2" in result
+    assert "A2" in result
+
+
+def test_gemini_jsonl_no_session_metadata():
+    """Without session_metadata, parser returns None — guards against false
+    positives on Claude Code / Codex JSONL passed through the dispatch chain."""
+    lines = [
+        json.dumps({"type": "user", "content": [{"text": "Hi"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "Hello"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is None
+
+
+def test_gemini_jsonl_skips_message_update():
+    """message_update records carry only token counts — must be ignored,
+    not turned into empty drawers or duplicated assistant turns."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "A"}]}),
+        json.dumps({"type": "message_update", "id": "m2", "tokens": {"input": 10, "output": 5}}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "tokens" not in result
+    assert "input" not in result
+
+
+def test_gemini_jsonl_too_few_messages():
+    """Mirror codex/claude_code behavior: < 2 conversational messages = None."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        json.dumps({"type": "user", "content": [{"text": "only one msg"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is None
+
+
+def test_gemini_jsonl_multi_block_content():
+    """A single message can have multiple text blocks in its content array
+    (e.g. a thinking block + a final answer). Both should be concatenated
+    into one transcript turn, in order."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
+        json.dumps(
+            {
+                "type": "gemini",
+                "content": [{"text": "First part."}, {"text": "Second part."}],
+            }
+        ),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "First part." in result
+    assert "Second part." in result
+
+
+def test_gemini_jsonl_empty_content_skipped():
+    """A message whose content array yields no text should be skipped, not
+    emit an empty turn that would corrupt the transcript."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        json.dumps({"type": "user", "content": []}),
+        json.dumps({"type": "user", "content": [{"text": "real Q"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "real A"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "> real Q" in result
+    assert "real A" in result
+
+
+def test_gemini_jsonl_invalid_json_lines_skipped():
+    """A malformed line in the middle of the stream must not abort parsing —
+    the rest of the session should still produce a transcript."""
+    lines = [
+        json.dumps({"type": "session_metadata"}),
+        "not-valid-json{",
+        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "A"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "> Q" in result
+
+
+def test_gemini_jsonl_does_not_match_codex():
+    """Codex JSONL passed in must NOT be parsed by the gemini adapter — the
+    dispatch chain in _try_normalize_json relies on each adapter returning
+    None when it doesn't recognize a format."""
+    lines = [
+        json.dumps({"type": "session_meta", "payload": {}}),
+        json.dumps({"type": "event_msg", "payload": {"type": "user_message", "message": "Q"}}),
+        json.dumps({"type": "event_msg", "payload": {"type": "agent_message", "message": "A"}}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is None
+
+
+def test_gemini_jsonl_messages_before_session_metadata_discarded():
+    """user/gemini turns that appear before the session_metadata sentinel must
+    be silently discarded, not counted as conversational messages.  Only turns
+    after the sentinel contribute to the transcript."""
+    lines = [
+        json.dumps({"type": "user", "content": [{"text": "preamble Q"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "preamble A"}]}),
+        json.dumps({"type": "session_metadata", "sessionId": "s"}),
+        json.dumps({"type": "user", "content": [{"text": "real Q"}]}),
+        json.dumps({"type": "gemini", "content": [{"text": "real A"}]}),
+    ]
+    result = _try_gemini_jsonl("\n".join(lines))
+    assert result is not None
+    assert "preamble Q" not in result
+    assert "preamble A" not in result
+    assert "> real Q" in result
+    assert "real A" in result
+
+
 # ── _try_claude_ai_json ───────────────────────────────────────────────