Skip to content

Commit 386a40f

Browse files
committed
feat: add HuggingFace tokenizer integration for precise token counting
- Add HuggingFace tokenizer integration for accurate token counting
- Sync tokenizer API and load from cache in constructor
- Pass useTokenizer and tokenizer to estimateSessionTokenCountForAfterTurn
- Add comprehensive tokenizer integration tests
1 parent 4684f2d commit 386a40f

25 files changed

Lines changed: 1443628 additions & 210 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ dist/
88
tui/lcm-tui
99
dist/
1010
tui/tui
11+
pnpm-lock.yaml

index.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ import { createLcmDescribeTool } from "./src/tools/lcm-describe-tool.js";
1313
import { createLcmExpandQueryTool } from "./src/tools/lcm-expand-query-tool.js";
1414
import { createLcmExpandTool } from "./src/tools/lcm-expand-tool.js";
1515
import { createLcmGrepTool } from "./src/tools/lcm-grep-tool.js";
16-
import type { LcmDependencies } from "./src/types.js";
16+
import type { LcmDependencies, TokenizerService } from "./src/types.js";
17+
import { HuggingFaceTokenizer } from "./src/tokenizers/huggingface.js";
1718

1819
/** Parse `agent:<agentId>:<suffix...>` session keys. */
1920
function parseAgentSessionKey(sessionKey: string): { agentId: string; suffix: string } | null {
@@ -1262,6 +1263,13 @@ function createLcmDependencies(api: OpenClawPluginApi): LcmDependencies {
12621263
error: (msg) => api.logger.error(msg),
12631264
debug: (msg) => api.logger.debug?.(msg),
12641265
},
1266+
tokenizer: config.useTokenizer
1267+
? (() => {
1268+
const t = new HuggingFaceTokenizer(envSnapshot.openclawDefaultModel || "glm-5", config.proxy);
1269+
api.logger.info(`[lcm] Tokenizer created (model=${envSnapshot.openclawDefaultModel || "glm-5"}, proxy=${config.proxy || "none"})`);
1270+
return t;
1271+
})()
1272+
: undefined,
12651273
};
12661274
}
12671275

@@ -1317,7 +1325,7 @@ const lcmPlugin = {
13171325
);
13181326

13191327
api.logger.info(
1320-
`[lcm] Plugin loaded (enabled=${deps.config.enabled}, db=${deps.config.databasePath}, threshold=${deps.config.contextThreshold})`,
1328+
`[lcm] Plugin loaded (enabled=${deps.config.enabled}, db=${deps.config.databasePath}, threshold=${deps.config.contextThreshold}, useTokenizer=${deps.config.useTokenizer}${deps.config.proxy ? `, proxy=${deps.config.proxy}` : ""})`,
13211329
);
13221330
},
13231331
};

openclaw.plugin.json

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,27 +24,39 @@
2424
"summaryProvider": {
2525
"label": "Summary Provider",
2626
"help": "Provider override for LCM summarization (e.g., 'openai-resp')"
27+
},
28+
"useTokenizer": {
29+
"label": "Use Precise Tokenizer",
30+
"help": "Use HuggingFace tokenizer service instead of chars/4 heuristic"
31+
},
32+
"proxy": {
33+
"label": "Proxy URL",
34+
"help": "HTTP(S) proxy for tokenizer downloads from HuggingFace"
2735
}
2836
},
2937
"configSchema": {
3038
"type": "object",
3139
"additionalProperties": false,
3240
"properties": {
3341
"enabled": {
34-
"type": "boolean"
42+
"type": "boolean",
43+
"description": "Enable or disable the plugin"
3544
},
3645
"contextThreshold": {
3746
"type": "number",
3847
"minimum": 0,
39-
"maximum": 1
48+
"maximum": 1,
49+
"description": "Fraction of context window that triggers compaction (0.0–1.0)"
4050
},
4151
"incrementalMaxDepth": {
4252
"type": "integer",
43-
"minimum": -1
53+
"minimum": -1,
54+
"description": "How deep incremental compaction goes (0 = leaf only, -1 = unlimited)"
4455
},
4556
"freshTailCount": {
4657
"type": "integer",
47-
"minimum": 1
58+
"minimum": 1,
59+
"description": "Number of recent messages protected from compaction"
4860
},
4961
"leafMinFanout": {
5062
"type": "integer",
@@ -59,17 +71,47 @@
5971
"minimum": 2
6072
},
6173
"dbPath": {
62-
"type": "string"
74+
"type": "string",
75+
"description": "Path to LCM SQLite database (default: ~/.openclaw/lcm.db)"
6376
},
6477
"largeFileThresholdTokens": {
6578
"type": "integer",
66-
"minimum": 1000
79+
"minimum": 1000,
80+
"description": "Token threshold for treating files as 'large'"
6781
},
6882
"summaryModel": {
6983
"type": "string"
7084
},
7185
"summaryProvider": {
7286
"type": "string"
87+
},
88+
"useTokenizer": {
89+
"type": "boolean",
90+
"description": "Use precise tokenizer service instead of chars/4 heuristic"
91+
},
92+
"proxy": {
93+
"type": "string",
94+
"description": "HTTP(S) proxy URL for tokenizer downloads from HuggingFace"
95+
},
96+
"timezone": {
97+
"type": "string",
98+
"description": "IANA timezone for timestamps in summaries"
99+
},
100+
"pruneHeartbeatOk": {
101+
"type": "boolean",
102+
"description": "Delete HEARTBEAT_OK turn cycles from LCM storage"
103+
},
104+
"autocompactDisabled": {
105+
"type": "boolean",
106+
"description": "Disable automatic compaction"
107+
},
108+
"largeFileSummaryProvider": {
109+
"type": "string",
110+
"description": "Provider override for large-file summarization"
111+
},
112+
"largeFileSummaryModel": {
113+
"type": "string",
114+
"description": "Model override for large-file summarization"
73115
}
74116
}
75117
}

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,11 @@
3030
"version-packages": "changeset version"
3131
},
3232
"dependencies": {
33+
"@huggingface/tokenizers": "^0.1.2",
3334
"@mariozechner/pi-agent-core": "*",
3435
"@mariozechner/pi-ai": "*",
35-
"@sinclair/typebox": "0.34.48"
36+
"@sinclair/typebox": "0.34.48",
37+
"undici": "^7.22.0"
3638
},
3739
"devDependencies": {
3840
"@changesets/cli": "^2.30.0",

scripts/generate-manifest.ts

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/**
2+
* Generate openclaw.plugin.json configSchema from LcmConfig type.
3+
* Run: npx tsx scripts/generate-manifest.ts
4+
*/
5+
import { readFileSync, writeFileSync } from "node:fs";
6+
import { join } from "node:path";
7+
8+
/**
 * JSON Schema property definitions written to `configSchema.properties` in
 * openclaw.plugin.json. This table is the single source of truth for the
 * manifest schema; keep it in sync with the LcmConfig type in src/db/config.ts.
 *
 * The generated schema sets `additionalProperties: false`, so any config key
 * missing from this table is rejected by validation. `summaryModel` and
 * `summaryProvider` are listed here because the manifest already declares
 * them; leaving them out would drop them on the next regeneration.
 */
const CONFIG_SCHEMA_PROPERTIES: Record<string, unknown> = {
  enabled: {
    type: "boolean",
    description: "Enable or disable the plugin",
  },
  contextThreshold: {
    type: "number",
    minimum: 0,
    maximum: 1,
    description: "Fraction of context window that triggers compaction (0.0–1.0)",
  },
  incrementalMaxDepth: {
    type: "integer",
    minimum: -1,
    description: "How deep incremental compaction goes (0 = leaf only, -1 = unlimited)",
  },
  freshTailCount: {
    type: "integer",
    minimum: 1,
    description: "Number of recent messages protected from compaction",
  },
  leafMinFanout: {
    type: "integer",
    minimum: 2,
  },
  condensedMinFanout: {
    type: "integer",
    minimum: 2,
  },
  condensedMinFanoutHard: {
    type: "integer",
    minimum: 2,
  },
  dbPath: {
    type: "string",
    description: "Path to LCM SQLite database (default: ~/.openclaw/lcm.db)",
  },
  largeFileThresholdTokens: {
    type: "integer",
    minimum: 1000,
    description: "Token threshold for treating files as 'large'",
  },
  // Declared without a description, exactly as in the existing manifest.
  summaryModel: {
    type: "string",
  },
  summaryProvider: {
    type: "string",
  },
  useTokenizer: {
    type: "boolean",
    description: "Use precise tokenizer service instead of chars/4 heuristic",
  },
  proxy: {
    type: "string",
    description: "HTTP(S) proxy URL for tokenizer downloads from HuggingFace",
  },
  timezone: {
    type: "string",
    description: "IANA timezone for timestamps in summaries",
  },
  pruneHeartbeatOk: {
    type: "boolean",
    description: "Delete HEARTBEAT_OK turn cycles from LCM storage",
  },
  autocompactDisabled: {
    type: "boolean",
    description: "Disable automatic compaction",
  },
  largeFileSummaryProvider: {
    type: "string",
    description: "Provider override for large-file summarization",
  },
  largeFileSummaryModel: {
    type: "string",
    description: "Model override for large-file summarization",
  },
};
81+
82+
/**
 * Control-panel hints (label + help text) written to `uiHints` in
 * openclaw.plugin.json, keyed by config property name.
 *
 * `summaryProvider` mirrors the hint already present in the manifest so that
 * regenerating the file does not delete it.
 * NOTE(review): the manifest may hold further hand-written hints that are not
 * visible from this script (e.g. one for `summaryModel`) — confirm and mirror
 * them here as well.
 */
const UI_HINTS: Record<string, { label: string; help: string }> = {
  contextThreshold: {
    label: "Context Threshold",
    help: "Fraction of context window that triggers compaction (0.0–1.0)",
  },
  incrementalMaxDepth: {
    label: "Incremental Max Depth",
    help: "How deep incremental compaction goes (0 = leaf only, -1 = unlimited)",
  },
  freshTailCount: {
    label: "Fresh Tail Count",
    help: "Number of recent messages protected from compaction",
  },
  dbPath: {
    label: "Database Path",
    help: "Path to LCM SQLite database (default: ~/.openclaw/lcm.db)",
  },
  summaryProvider: {
    label: "Summary Provider",
    help: "Provider override for LCM summarization (e.g., 'openai-resp')",
  },
  useTokenizer: {
    label: "Use Precise Tokenizer",
    help: "Use HuggingFace tokenizer service instead of chars/4 heuristic",
  },
  proxy: {
    label: "Proxy URL",
    help: "HTTP(S) proxy for tokenizer downloads from HuggingFace",
  },
};
109+
110+
/** Narrow an unknown value to a plain (non-null, non-array) JSON object. */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Regenerate the `configSchema` and `uiHints` sections of openclaw.plugin.json
 * (one directory above this script) from CONFIG_SCHEMA_PROPERTIES and UI_HINTS,
 * leaving every other manifest field as it was.
 *
 * A missing manifest is treated as an empty one and created. Any other read
 * failure, or a manifest that is not a JSON object, aborts the run instead of
 * silently overwriting the file and losing its remaining fields.
 *
 * @throws If the manifest exists but cannot be read, is not valid JSON, or does
 *   not contain a JSON object at the top level.
 */
function generateManifest() {
  // import.meta.dirname is only available on recent Node releases (v20.11+).
  const manifestPath = join(import.meta.dirname, "..", "openclaw.plugin.json");

  // Read the existing manifest; only "file does not exist" is recoverable.
  let raw: string | undefined;
  try {
    raw = readFileSync(manifestPath, "utf8");
  } catch (err: unknown) {
    const missing = isJsonObject(err) && err.code === "ENOENT";
    if (!missing) {
      throw err;
    }
  }

  let manifest: Record<string, unknown> = {};
  if (raw !== undefined) {
    const parsed: unknown = JSON.parse(raw);
    if (!isJsonObject(parsed)) {
      throw new Error(`${manifestPath} must contain a JSON object`);
    }
    manifest = parsed;
  }

  // The schema is fully owned by this script, so it is replaced wholesale.
  manifest.configSchema = {
    type: "object",
    additionalProperties: false,
    properties: CONFIG_SCHEMA_PROPERTIES,
  };

  // Hints are merged: entries defined here win, but hand-written hints for
  // keys this script does not know about are kept rather than deleted.
  const existingHints = isJsonObject(manifest.uiHints) ? manifest.uiHints : {};
  manifest.uiHints = { ...existingHints, ...UI_HINTS };

  // Ensure basic fields exist
  if (!manifest.id) {
    manifest.id = "lossless-claw";
  }

  // Write back
  writeFileSync(manifestPath, JSON.stringify(manifest, null, 2) + "\n", "utf8");
  console.log("✓ Generated openclaw.plugin.json configSchema");
  console.log(`  Properties: ${Object.keys(CONFIG_SCHEMA_PROPERTIES).join(", ")}`);
}

generateManifest();

0 commit comments

Comments
 (0)