Skip to content

Commit 0d80be6

Browse files
committed
fix: apply ingest protections during bootstrap import
1 parent 8149b76 commit 0d80be6

2 files changed

Lines changed: 238 additions & 43 deletions

File tree

src/engine.ts

Lines changed: 59 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3768,6 +3768,7 @@ export class LcmContextEngine implements ContextEngine {
37683768
sessionKey?: string;
37693769
conversationId: number;
37703770
historicalMessages: AgentMessage[];
3771+
checkpointEntryHash?: string | null;
37713772
}): Promise<{
37723773
blockedByImportCap: boolean;
37733774
importedMessages: number;
@@ -3868,10 +3869,24 @@ export class LcmContextEngine implements ContextEngine {
38683869
}
38693870

38703871
if (anchorIndex < 0) {
3871-
this.deps.log.info(
3872-
`[lcm] reconcileSessionTail: no anchor for ${sessionContext} duration=${formatDurationMs(Date.now() - startedAt)} historicalMessages=${historicalMessages.length} importedMessages=0 overlap=false`,
3873-
);
3874-
return { blockedByImportCap: false, importedMessages: 0, hasOverlap: false };
3872+
const checkpointEntryHash = params.checkpointEntryHash;
3873+
if (checkpointEntryHash) {
3874+
// Externalized bootstrap rows no longer match raw JSONL content, so
3875+
// fall back to the raw transcript checkpoint before declaring no overlap.
3876+
for (let index = storedHistoricalMessages.length - 1; index >= 0; index--) {
3877+
if (createBootstrapEntryHash(storedHistoricalMessages[index]) === checkpointEntryHash) {
3878+
anchorIndex = index;
3879+
break;
3880+
}
3881+
}
3882+
}
3883+
3884+
if (anchorIndex < 0) {
3885+
this.deps.log.info(
3886+
`[lcm] reconcileSessionTail: no anchor for ${sessionContext} duration=${formatDurationMs(Date.now() - startedAt)} historicalMessages=${historicalMessages.length} importedMessages=0 overlap=false`,
3887+
);
3888+
return { blockedByImportCap: false, importedMessages: 0, hasOverlap: false };
3889+
}
38753890
}
38763891
if (anchorIndex >= historicalMessages.length - 1) {
38773892
this.deps.log.info(
@@ -3910,15 +3925,15 @@ export class LcmContextEngine implements ContextEngine {
39103925
/**
39113926
* Persist bootstrap checkpoint metadata anchored to the current DB frontier.
39123927
*
3913-
* We intentionally checkpoint the session file's current EOF while hashing the
3914-
* latest persisted DB message. This keeps append-only recovery aligned with the
3915-
* canonical LCM frontier even when trailing transcript entries are pruned or
3916-
* otherwise noncanonical.
3928+
* By default, the frontier hash follows the latest persisted DB message. The
3929+
* first-time bootstrap path can override it with the raw transcript hash so
3930+
* later reconciliation can anchor entries whose DB content was externalized.
39173931
*/
39183932
private async refreshBootstrapState(params: {
39193933
conversationId: number;
39203934
sessionFile: string;
39213935
fileStats?: { size: number; mtimeMs: number };
3936+
lastProcessedEntryHash?: string | null;
39223937
}): Promise<void> {
39233938
const latestDbMessage = await this.conversationStore.getLastMessage(params.conversationId);
39243939
const fileStats = params.fileStats ?? (await stat(params.sessionFile));
@@ -3928,13 +3943,16 @@ export class LcmContextEngine implements ContextEngine {
39283943
lastSeenSize: fileStats.size,
39293944
lastSeenMtimeMs: Math.trunc(fileStats.mtimeMs),
39303945
lastProcessedOffset: fileStats.size,
3931-
lastProcessedEntryHash: latestDbMessage
3932-
? createBootstrapEntryHash({
3933-
role: latestDbMessage.role,
3934-
content: latestDbMessage.content,
3935-
tokenCount: latestDbMessage.tokenCount,
3936-
})
3937-
: null,
3946+
lastProcessedEntryHash:
3947+
params.lastProcessedEntryHash !== undefined
3948+
? params.lastProcessedEntryHash
3949+
: latestDbMessage
3950+
? createBootstrapEntryHash({
3951+
role: latestDbMessage.role,
3952+
content: latestDbMessage.content,
3953+
tokenCount: latestDbMessage.tokenCount,
3954+
})
3955+
: null,
39383956
});
39393957
}
39403958

@@ -3973,6 +3991,7 @@ export class LcmContextEngine implements ContextEngine {
39733991
this.conversationStore.withTransaction(async () => {
39743992
const persistBootstrapState = async (
39753993
conversationId: number,
3994+
lastProcessedEntryHash?: string | null,
39763995
): Promise<void> => {
39773996
await this.refreshBootstrapState({
39783997
conversationId,
@@ -3981,6 +4000,7 @@ export class LcmContextEngine implements ContextEngine {
39814000
size: sessionFileSize,
39824001
mtimeMs: sessionFileMtimeMs,
39834002
},
4003+
lastProcessedEntryHash,
39844004
});
39854005
// Update the file-level cache so subsequent bootstraps against an
39864006
// unchanged file can skip the full read via the cache guard.
@@ -4215,28 +4235,24 @@ export class LcmContextEngine implements ContextEngine {
42154235
};
42164236
}
42174237

4218-
const nextSeq = (await this.conversationStore.getMaxSeq(conversationId)) + 1;
4219-
const bulkInput = bootstrapMessages.map((message, index) => {
4220-
const stored = toStoredMessage(message);
4221-
return {
4222-
conversationId,
4223-
seq: nextSeq + index,
4224-
role: stored.role,
4225-
content: stored.content,
4226-
tokenCount: stored.tokenCount,
4227-
};
4228-
});
4229-
4230-
const inserted = await this.conversationStore.createMessagesBulk(bulkInput);
4231-
await this.summaryStore.appendContextMessages(
4232-
conversationId,
4233-
inserted.map((record) => record.messageId),
4234-
);
4238+
let importedMessages = 0;
4239+
for (const message of bootstrapMessages) {
4240+
const result = await this.ingestSingle({
4241+
sessionId: params.sessionId,
4242+
sessionKey: params.sessionKey,
4243+
message,
4244+
});
4245+
if (result.ingested) {
4246+
importedMessages += 1;
4247+
}
4248+
}
42354249
await this.conversationStore.markConversationBootstrapped(conversationId);
42364250

42374251
// Prune HEARTBEAT_OK turns from the freshly imported data
4252+
let prunedMessages = 0;
42384253
if (this.config.pruneHeartbeatOk) {
42394254
const pruned = await this.pruneHeartbeatOkTurns(conversationId);
4255+
prunedMessages = pruned;
42404256
if (pruned > 0) {
42414257
this.clearStableOrphanStrippingOrdinal(conversationId);
42424258
this.deps.log.info(
@@ -4245,17 +4261,23 @@ export class LcmContextEngine implements ContextEngine {
42454261
}
42464262
}
42474263

4248-
await persistBootstrapState(conversationId);
4249-
if (inserted.length > 0) {
4264+
const lastImportedHash =
4265+
prunedMessages === 0 && bootstrapMessages.length > 0
4266+
? createBootstrapEntryHash(
4267+
toStoredMessage(bootstrapMessages[bootstrapMessages.length - 1]),
4268+
)
4269+
: undefined;
4270+
await persistBootstrapState(conversationId, lastImportedHash);
4271+
if (importedMessages > 0) {
42504272
this.clearStableOrphanStrippingOrdinal(conversationId);
42514273
}
42524274
this.deps.log.info(
4253-
`[lcm] bootstrap: initial import conversation=${conversationId} ${sessionLabel} importedMessages=${inserted.length} sourceMessages=${historicalMessages.length} duration=${formatDurationMs(Date.now() - startedAt)}`,
4275+
`[lcm] bootstrap: initial import conversation=${conversationId} ${sessionLabel} importedMessages=${importedMessages} sourceMessages=${historicalMessages.length} duration=${formatDurationMs(Date.now() - startedAt)}`,
42544276
);
42554277

42564278
return {
42574279
bootstrapped: true,
4258-
importedMessages: inserted.length,
4280+
importedMessages,
42594281
};
42604282
}
42614283

@@ -4266,6 +4288,7 @@ export class LcmContextEngine implements ContextEngine {
42664288
sessionKey: params.sessionKey,
42674289
conversationId,
42684290
historicalMessages,
4291+
checkpointEntryHash: bootstrapState?.lastProcessedEntryHash,
42694292
});
42704293
this.deps.log.info(
42714294
`[lcm] bootstrap: reconcile finished conversation=${conversationId} ${sessionLabel} importedMessages=${reconcile.importedMessages} overlap=${reconcile.hasOverlap} blockedByImportCap=${reconcile.blockedByImportCap} duration=${formatDurationMs(Date.now() - startedAt)}`,

test/engine.test.ts

Lines changed: 179 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3728,16 +3728,16 @@ describe("LcmContextEngine.bootstrap", () => {
37283728
expect(reconcileSpy).toHaveBeenCalledTimes(1);
37293729
});
37303730

3731-
it("uses the bulk import path for initial bootstrap", async () => {
3732-
const sessionFile = createSessionFilePath("bulk");
3731+
it("uses the live ingest path for initial bootstrap", async () => {
3732+
const sessionFile = createSessionFilePath("bootstrap-ingest-path");
37333733
const sm = SessionManager.open(sessionFile);
37343734
sm.appendMessage({
37353735
role: "user",
3736-
content: [{ type: "text", text: "bulk one" }],
3736+
content: [{ type: "text", text: "ingest one" }],
37373737
} as AgentMessage);
37383738
sm.appendMessage({
37393739
role: "assistant",
3740-
content: [{ type: "text", text: "bulk two" }],
3740+
content: [{ type: "text", text: "ingest two" }],
37413741
} as AgentMessage);
37423742

37433743
const warnLog = vi.fn();
@@ -3753,13 +3753,185 @@ describe("LcmContextEngine.bootstrap", () => {
37533753
const singleSpy = vi.spyOn(engine.getConversationStore(), "createMessage");
37543754

37553755
const result = await engine.bootstrap({
3756-
sessionId: "bootstrap-bulk",
3756+
sessionId: "bootstrap-ingest-path",
37573757
sessionFile,
37583758
});
37593759

37603760
expect(result.bootstrapped).toBe(true);
3761-
expect(bulkSpy).toHaveBeenCalledTimes(1);
3762-
expect(singleSpy).not.toHaveBeenCalled();
3761+
expect(result.importedMessages).toBe(2);
3762+
expect(bulkSpy).not.toHaveBeenCalled();
3763+
expect(singleSpy).toHaveBeenCalledTimes(2);
3764+
});
3765+
3766+
it("externalizes oversized file blocks during first-time bootstrap and still reconciles later tail messages", async () => {
3767+
await withTempHome(async () => {
3768+
const sessionFile = createSessionFilePath("bootstrap-large-file-parity");
3769+
const fileText = `${"bootstrap file line\n".repeat(160)}done`;
3770+
writeFileSync(
3771+
sessionFile,
3772+
`${JSON.stringify({
3773+
role: "user",
3774+
content: `<file name="bootstrap.md" mime="text/markdown">${fileText}</file>`,
3775+
})}\n`,
3776+
"utf8",
3777+
);
3778+
3779+
const engine = createEngineWithConfig({ largeFileTokenThreshold: 20 });
3780+
const sessionId = "bootstrap-large-file-parity";
3781+
const first = await engine.bootstrap({ sessionId, sessionFile });
3782+
expect(first).toEqual({
3783+
bootstrapped: true,
3784+
importedMessages: 1,
3785+
});
3786+
3787+
const conversation = await engine.getConversationStore().getConversationBySessionId(sessionId);
3788+
expect(conversation).not.toBeNull();
3789+
3790+
const initiallyStored = await engine
3791+
.getConversationStore()
3792+
.getMessages(conversation!.conversationId);
3793+
expect(initiallyStored).toHaveLength(1);
3794+
expect(initiallyStored[0].content).toContain("[LCM File: file_");
3795+
expect(initiallyStored[0].content).not.toContain("<file name=");
3796+
expect(initiallyStored[0].content).not.toContain(fileText.slice(0, 64));
3797+
3798+
const fileIdMatch = initiallyStored[0].content.match(/file_[a-f0-9]{16}/);
3799+
expect(fileIdMatch).not.toBeNull();
3800+
const storedFile = await engine.getSummaryStore().getLargeFile(fileIdMatch![0]);
3801+
expect(storedFile).not.toBeNull();
3802+
expect(storedFile!.fileName).toBe("bootstrap.md");
3803+
expect(readFileSync(storedFile!.storageUri, "utf8")).toBe(fileText);
3804+
3805+
const parts = await engine.getConversationStore().getMessageParts(initiallyStored[0].messageId);
3806+
expect(parts).toHaveLength(1);
3807+
expect(parts[0].textContent).toContain("[LCM File: file_");
3808+
3809+
appendFileSync(
3810+
sessionFile,
3811+
`${JSON.stringify({
3812+
role: "assistant",
3813+
content: [{ type: "text", text: "tail after externalized bootstrap" }],
3814+
})}\n`,
3815+
"utf8",
3816+
);
3817+
3818+
const second = await engine.bootstrap({ sessionId, sessionFile });
3819+
expect(second).toEqual({
3820+
bootstrapped: true,
3821+
importedMessages: 1,
3822+
reason: "reconciled missing session messages",
3823+
});
3824+
3825+
const afterReconcile = await engine
3826+
.getConversationStore()
3827+
.getMessages(conversation!.conversationId);
3828+
expect(afterReconcile.map((message) => message.content)).toEqual([
3829+
initiallyStored[0].content,
3830+
"tail after externalized bootstrap",
3831+
]);
3832+
});
3833+
});
3834+
3835+
it("externalizes inline images during first-time bootstrap", async () => {
3836+
const largeFilesDir = mkdtempSync(join(tmpdir(), "lossless-claw-large-files-"));
3837+
tempDirs.push(largeFilesDir);
3838+
const sessionFile = createSessionFilePath("bootstrap-inline-image-parity");
3839+
const base64Image = `iVBOR${"A".repeat(600)}`;
3840+
writeFileSync(
3841+
sessionFile,
3842+
`${JSON.stringify({
3843+
role: "user",
3844+
content: `[media attached: bootstrap.png]\n${base64Image}\n`,
3845+
})}\n`,
3846+
"utf8",
3847+
);
3848+
3849+
const engine = createEngineWithConfig({
3850+
largeFileTokenThreshold: 20,
3851+
largeFilesDir,
3852+
});
3853+
const sessionId = "bootstrap-inline-image-parity";
3854+
const result = await engine.bootstrap({ sessionId, sessionFile });
3855+
expect(result.bootstrapped).toBe(true);
3856+
expect(result.importedMessages).toBe(1);
3857+
3858+
const conversation = await engine.getConversationStore().getConversationBySessionId(sessionId);
3859+
expect(conversation).not.toBeNull();
3860+
const messages = await engine.getConversationStore().getMessages(conversation!.conversationId);
3861+
expect(messages).toHaveLength(1);
3862+
expect(messages[0].content).toContain("[User image: bootstrap.png");
3863+
expect(messages[0].content).not.toContain(base64Image.slice(0, 32));
3864+
3865+
const fileIdMatch = messages[0].content.match(/file_[a-f0-9]{16}/);
3866+
expect(fileIdMatch).not.toBeNull();
3867+
const storedFile = await engine.getSummaryStore().getLargeFile(fileIdMatch![0]);
3868+
expect(storedFile).not.toBeNull();
3869+
expect(storedFile!.mimeType).toBe("image/png");
3870+
expect(storedFile!.storageUri).toContain(`${largeFilesDir}/${conversation!.conversationId}/`);
3871+
});
3872+
3873+
it("externalizes oversized tool results during first-time bootstrap", async () => {
3874+
await withTempHome(async () => {
3875+
const sessionFile = createSessionFilePath("bootstrap-tool-result-parity");
3876+
const sm = SessionManager.open(sessionFile);
3877+
const toolOutput = `${"bootstrap tool output\n".repeat(160)}done`;
3878+
sm.appendMessage({
3879+
role: "assistant",
3880+
content: [
3881+
{
3882+
type: "toolCall",
3883+
id: "call_bootstrap_externalized",
3884+
name: "exec",
3885+
input: { cmd: "cat large.txt" },
3886+
},
3887+
],
3888+
} as AgentMessage);
3889+
sm.appendMessage({
3890+
role: "toolResult",
3891+
toolCallId: "call_bootstrap_externalized",
3892+
toolName: "exec",
3893+
content: [
3894+
{
3895+
type: "tool_result",
3896+
tool_use_id: "call_bootstrap_externalized",
3897+
name: "exec",
3898+
content: [{ type: "text", text: toolOutput }],
3899+
},
3900+
],
3901+
} as AgentMessage);
3902+
3903+
const engine = createEngineWithConfig({ largeFileTokenThreshold: 20 });
3904+
const sessionId = "bootstrap-tool-result-parity";
3905+
const result = await engine.bootstrap({ sessionId, sessionFile });
3906+
expect(result.bootstrapped).toBe(true);
3907+
expect(result.importedMessages).toBe(2);
3908+
3909+
const conversation = await engine.getConversationStore().getConversationBySessionId(sessionId);
3910+
expect(conversation).not.toBeNull();
3911+
const messages = await engine.getConversationStore().getMessages(conversation!.conversationId);
3912+
expect(messages).toHaveLength(2);
3913+
expect(messages[1].content).toContain("[LCM Tool Output: file_");
3914+
expect(messages[1].content).toContain("tool=exec");
3915+
expect(messages[1].content).not.toContain(toolOutput.slice(0, 64));
3916+
3917+
const fileIdMatch = messages[1].content.match(/file_[a-f0-9]{16}/);
3918+
expect(fileIdMatch).not.toBeNull();
3919+
const fileId = fileIdMatch![0];
3920+
const storedFile = await engine.getSummaryStore().getLargeFile(fileId);
3921+
expect(storedFile).not.toBeNull();
3922+
expect(storedFile!.fileName).toBe("exec.txt");
3923+
expect(readFileSync(storedFile!.storageUri, "utf8")).toBe(toolOutput);
3924+
3925+
const parts = await engine.getConversationStore().getMessageParts(messages[1].messageId);
3926+
expect(parts).toHaveLength(1);
3927+
const metadata = JSON.parse(parts[0].metadata ?? "{}") as Record<string, unknown>;
3928+
expect(metadata).toMatchObject({
3929+
externalizedFileId: fileId,
3930+
originalByteSize: Buffer.byteLength(toolOutput, "utf8"),
3931+
toolOutputExternalized: true,
3932+
externalizationReason: "large_tool_result",
3933+
});
3934+
});
37633935
});
37643936

37653937
it("limits first-time bootstrap imports to the newest messages within bootstrapMaxTokens", async () => {

0 commit comments

Comments
 (0)