Skip to content

Commit fdfaf01

Browse files
authored
Merge pull request #1234 from MemPalace/feat/normalize-gemini-cli
feat(normalize): Gemini CLI session JSONL adapter
2 parents 4ffd0bd + e7fe6ca commit fdfaf01

2 files changed

Lines changed: 236 additions & 0 deletions

File tree

mempalace/normalize.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
- ChatGPT conversations.json
99
- Claude Code JSONL (with tool_use/tool_result block capture)
1010
- OpenAI Codex CLI JSONL
11+
- Gemini CLI JSONL (~/.gemini/tmp/<project_hash>/chats/session-*.jsonl)
1112
- Slack JSON export
1213
- Plain text (pass through for paragraph chunking)
1314
@@ -157,6 +158,10 @@ def _try_normalize_json(content: str) -> Optional[str]:
157158
if normalized:
158159
return normalized
159160

161+
normalized = _try_gemini_jsonl(content)
162+
if normalized:
163+
return normalized
164+
160165
try:
161166
data = json.loads(content)
162167
except json.JSONDecodeError:
@@ -280,6 +285,74 @@ def _try_codex_jsonl(content: str) -> Optional[str]:
280285
return None
281286

282287

288+
def _try_gemini_jsonl(content: str) -> Optional[str]:
    """Normalize a Gemini CLI session log into a transcript, or return None.

    Session files live at ``~/.gemini/tmp/<project_hash>/chats/session-*.jsonl``.
    Per google-gemini/gemini-cli#15292 the layout is one JSON object per
    line: a ``session_metadata`` record, followed by ``{"type": "user",
    "content": [{"text": "..."}]}`` and ``{"type": "gemini", "content":
    [...]}`` records, optionally interleaved with ``message_update`` records
    that only carry token counts.

    Rules applied while scanning:

    * Lines that are blank, are not valid JSON, or do not decode to an
      object are ignored.
    * A ``session_metadata`` record must be seen before any turn is
      accepted. This sentinel is what keeps the adapter from claiming
      Claude Code or Codex JSONL further down the dispatch chain; ``user``
      / ``gemini`` records that precede it are dropped as preamble.
    * Any record type other than ``user`` / ``gemini`` (``message_update``
      included) is skipped.
    * Within one record, every non-blank string ``text`` value found in the
      ``content`` list is kept in order and joined with newlines. Records
      that yield no text produce no turn.

    Returns:
        The result of ``_messages_to_transcript`` over ``(role, text)``
        pairs (``gemini`` is mapped to ``"assistant"``) when at least two
        turns were collected; otherwise ``None``.
    """
    turns = []
    seen_metadata = False

    for raw_line in content.strip().split("\n"):
        candidate = raw_line.strip()
        if not candidate:
            continue

        try:
            record = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if not isinstance(record, dict):
            continue

        kind = record.get("type", "")
        if kind == "session_metadata":
            seen_metadata = True
            continue

        # Nothing counts as a turn until the sentinel has been observed.
        if not seen_metadata:
            continue

        if kind == "user":
            role = "user"
        elif kind == "gemini":
            role = "assistant"
        else:
            # message_update, system events, and anything unrecognized.
            continue

        blocks = record.get("content", [])
        if not isinstance(blocks, list):
            continue

        raw_texts = (blk.get("text", "") for blk in blocks if isinstance(blk, dict))
        texts = [txt for txt in raw_texts if isinstance(txt, str) and txt.strip()]
        if texts:
            turns.append((role, "\n".join(texts)))

    if not seen_metadata or len(turns) < 2:
        return None
    return _messages_to_transcript(turns)
354+
355+
283356
def _try_claude_ai_json(data) -> Optional[str]:
284357
"""Claude.ai JSON export: flat messages list or privacy export with chat_messages."""
285358
if isinstance(data, dict):

tests/test_normalize.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
_try_claude_ai_json,
1212
_try_claude_code_jsonl,
1313
_try_codex_jsonl,
14+
_try_gemini_jsonl,
1415
_try_normalize_json,
1516
_try_slack_json,
1617
normalize,
@@ -450,6 +451,168 @@ def test_codex_jsonl_payload_not_dict():
450451
assert result is not None
451452

452453

454+
# ── _try_gemini_jsonl ──────────────────────────────────────────────────
455+
#
456+
# Gemini CLI sessions live at ``~/.gemini/tmp/<project_hash>/chats/`` as
457+
# JSONL. The schema (per google-gemini/gemini-cli#15292):
458+
#
459+
# {"type":"session_metadata","sessionId":"...","projectHash":"...",...}
460+
# {"type":"user","id":"msg1","content":[{"text":"Hello"}]}
461+
# {"type":"gemini","id":"msg2","content":[{"text":"Hi"}]}
462+
# {"type":"message_update","id":"msg2","tokens":{"input":10,"output":5}}
463+
#
464+
# Detection requires a ``session_metadata`` record so this parser does
465+
# not false-positive against Claude Code or Codex JSONL. ``message_update``
466+
# entries (token-count deltas only) are skipped — they carry no message
467+
# text. ``content`` is an array of ``{"text": "..."}`` blocks; we join
468+
# all text blocks for a given message.
469+
470+
471+
def test_gemini_jsonl_valid():
    """A metadata record plus one user and one gemini turn yields a transcript."""
    records = (
        {"type": "session_metadata", "sessionId": "abc", "projectHash": "h"},
        {"type": "user", "id": "m1", "content": [{"text": "Hello"}]},
        {"type": "gemini", "id": "m2", "content": [{"text": "Hi there"}]},
    )
    transcript = _try_gemini_jsonl("\n".join(json.dumps(rec) for rec in records))
    assert transcript is not None
    assert "> Hello" in transcript
    assert "Hi there" in transcript
481+
482+
483+
def test_gemini_jsonl_multi_turn():
    """Two question/answer exchanges after the metadata record all appear."""
    records = (
        {"type": "session_metadata", "sessionId": "s"},
        {"type": "user", "content": [{"text": "Q1"}]},
        {"type": "gemini", "content": [{"text": "A1"}]},
        {"type": "user", "content": [{"text": "Q2"}]},
        {"type": "gemini", "content": [{"text": "A2"}]},
    )
    transcript = _try_gemini_jsonl("\n".join(json.dumps(rec) for rec in records))
    assert transcript is not None
    for expected in ("> Q1", "A1", "> Q2", "A2"):
        assert expected in transcript
497+
498+
499+
def test_gemini_jsonl_no_session_metadata():
    """Turns without a session_metadata record are rejected (None).

    This is the guard against false positives on Claude Code / Codex JSONL
    passed through the dispatch chain.
    """
    records = (
        {"type": "user", "content": [{"text": "Hi"}]},
        {"type": "gemini", "content": [{"text": "Hello"}]},
    )
    payload = "\n".join(json.dumps(rec) for rec in records)
    assert _try_gemini_jsonl(payload) is None
508+
509+
510+
def test_gemini_jsonl_skips_message_update():
    """message_update records (token counts only) must not leak into output.

    They must be ignored rather than turned into empty drawers or duplicated
    assistant turns.
    """
    records = (
        {"type": "session_metadata"},
        {"type": "user", "content": [{"text": "Q"}]},
        {"type": "gemini", "content": [{"text": "A"}]},
        {"type": "message_update", "id": "m2", "tokens": {"input": 10, "output": 5}},
    )
    transcript = _try_gemini_jsonl("\n".join(json.dumps(rec) for rec in records))
    assert transcript is not None
    for leaked in ("tokens", "input"):
        assert leaked not in transcript
523+
524+
525+
def test_gemini_jsonl_too_few_messages():
    """Fewer than two conversational messages gives None, like codex/claude_code."""
    records = (
        {"type": "session_metadata"},
        {"type": "user", "content": [{"text": "only one msg"}]},
    )
    payload = "\n".join(json.dumps(rec) for rec in records)
    assert _try_gemini_jsonl(payload) is None
533+
534+
535+
def test_gemini_jsonl_multi_block_content():
    """A single message can have multiple text blocks in its content array
    (e.g. a thinking block + a final answer). Both should be concatenated
    into one transcript turn, in order."""
    lines = [
        json.dumps({"type": "session_metadata"}),
        json.dumps({"type": "user", "content": [{"text": "Q"}]}),
        json.dumps(
            {
                "type": "gemini",
                "content": [{"text": "First part."}, {"text": "Second part."}],
            }
        ),
    ]
    result = _try_gemini_jsonl("\n".join(lines))
    assert result is not None
    assert "First part." in result
    assert "Second part." in result
    # Presence alone would still pass if the adapter emitted the blocks in
    # reverse; pin the ordering that the docstring promises.
    assert result.index("First part.") < result.index("Second part.")
553+
554+
555+
def test_gemini_jsonl_empty_content_skipped():
    """A message with no text in its content array is dropped.

    It must not emit an empty turn that would corrupt the transcript.
    """
    records = (
        {"type": "session_metadata"},
        {"type": "user", "content": []},
        {"type": "user", "content": [{"text": "real Q"}]},
        {"type": "gemini", "content": [{"text": "real A"}]},
    )
    transcript = _try_gemini_jsonl("\n".join(json.dumps(rec) for rec in records))
    assert transcript is not None
    assert "> real Q" in transcript
    assert "real A" in transcript
568+
569+
570+
def test_gemini_jsonl_invalid_json_lines_skipped():
    """A malformed line mid-stream must not abort parsing.

    The remaining records should still produce a transcript.
    """
    metadata = json.dumps({"type": "session_metadata"})
    question = json.dumps({"type": "user", "content": [{"text": "Q"}]})
    answer = json.dumps({"type": "gemini", "content": [{"text": "A"}]})
    payload = "\n".join([metadata, "not-valid-json{", question, answer])

    transcript = _try_gemini_jsonl(payload)
    assert transcript is not None
    assert "> Q" in transcript
582+
583+
584+
def test_gemini_jsonl_does_not_match_codex():
    """Codex JSONL must not be claimed by the gemini adapter.

    The dispatch chain in _try_normalize_json relies on each adapter
    returning None for formats it does not recognize.
    """
    codex_records = (
        {"type": "session_meta", "payload": {}},
        {"type": "event_msg", "payload": {"type": "user_message", "message": "Q"}},
        {"type": "event_msg", "payload": {"type": "agent_message", "message": "A"}},
    )
    payload = "\n".join(json.dumps(rec) for rec in codex_records)
    assert _try_gemini_jsonl(payload) is None
595+
596+
597+
def test_gemini_jsonl_messages_before_session_metadata_discarded():
    """Turns seen before the session_metadata sentinel are silently dropped.

    They are not counted as conversational messages; only turns after the
    sentinel contribute to the transcript.
    """
    records = (
        {"type": "user", "content": [{"text": "preamble Q"}]},
        {"type": "gemini", "content": [{"text": "preamble A"}]},
        {"type": "session_metadata", "sessionId": "s"},
        {"type": "user", "content": [{"text": "real Q"}]},
        {"type": "gemini", "content": [{"text": "real A"}]},
    )
    transcript = _try_gemini_jsonl("\n".join(json.dumps(rec) for rec in records))
    assert transcript is not None
    for dropped in ("preamble Q", "preamble A"):
        assert dropped not in transcript
    assert "> real Q" in transcript
    assert "real A" in transcript
614+
615+
453616
# ── _try_claude_ai_json ───────────────────────────────────────────────
454617

455618

0 commit comments

Comments
 (0)