feat(memory): add Chinese tokenization support

- Add @node-rs/jieba for high-performance Chinese word segmentation
- Create tokenizer-zh.ts with hasChinese() and tokenizeMixed() functions
- Update buildFtsQuery() to handle mixed Chinese/English text
- Add preprocessTextForFts() for FTS indexing with Chinese text
- Update manager.ts to use preprocessing when writing to FTS table
- Add comprehensive tests for tokenizer and hybrid functions
This commit is contained in:
Jay Liu 2026-01-30 00:11:44 +13:00
parent 5f4715acfc
commit 02b1413bf5
8 changed files with 376 additions and 93 deletions

View File

@ -168,6 +168,7 @@
"@mariozechner/pi-coding-agent": "0.49.3",
"@mariozechner/pi-tui": "0.49.3",
"@mozilla/readability": "^0.6.0",
"@node-rs/jieba": "^1.10.4",
"@sinclair/typebox": "0.34.47",
"@slack/bolt": "^4.6.0",
"@slack/web-api": "^7.13.0",

251
pnpm-lock.yaml generated
View File

@ -55,6 +55,9 @@ importers:
'@mozilla/readability':
specifier: ^0.6.0
version: 0.6.0
'@node-rs/jieba':
specifier: ^1.10.4
version: 1.10.4
'@sinclair/typebox':
specifier: 0.34.47
version: 0.34.47
@ -383,12 +386,12 @@ importers:
'@microsoft/agents-hosting-extensions-teams':
specifier: ^1.2.2
version: 1.2.2
moltbot:
specifier: workspace:*
version: link:../..
express:
specifier: ^5.2.1
version: 5.2.1
moltbot:
specifier: workspace:*
version: link:../..
proper-lockfile:
specifier: ^4.1.2
version: 4.1.2
@ -1557,6 +1560,9 @@ packages:
resolution: {integrity: sha512-/p08f93LEbsL5mDZFQ3DBxcPv/I4QG9EDYRRq1WNlCOXVfAHBTHMSVMwxlqG/AtnSfUr9+vgfN7MKiyDo0+Weg==}
engines: {node: '>= 10'}
'@napi-rs/wasm-runtime@0.2.12':
resolution: {integrity: sha512-ZVWUcfwY4E/yPitQJl481FjFo3K22D6qF0DuFH6Y/nbnE11GY5uguDxZMGXPQ8WQ0128MXQD7TnfHyK4oWoIJQ==}
'@napi-rs/wasm-runtime@1.1.1':
resolution: {integrity: sha512-p64ah1M1ld8xjWv3qbvFwHiFVWrq1yFvV4f7w+mzaqiR4IlSgkqhcRdHwsGgomwzBH51sRY4NEowLxnaBjcW/A==}
@ -1658,6 +1664,93 @@ packages:
cpu: [x64]
os: [win32]
'@node-rs/jieba-android-arm-eabi@1.10.4':
resolution: {integrity: sha512-MhyvW5N3Fwcp385d0rxbCWH42kqDBatQTyP8XbnYbju2+0BO/eTeCCLYj7Agws4pwxn2LtdldXRSKavT7WdzNA==}
engines: {node: '>= 10'}
cpu: [arm]
os: [android]
'@node-rs/jieba-android-arm64@1.10.4':
resolution: {integrity: sha512-XyDwq5+rQ+Tk55A+FGi6PtJbzf974oqnpyCcCPzwU3QVXJCa2Rr4Lci+fx8oOpU4plT3GuD+chXMYLsXipMgJA==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [android]
'@node-rs/jieba-darwin-arm64@1.10.4':
resolution: {integrity: sha512-G++RYEJ2jo0rxF9626KUy90wp06TRUjAsvY/BrIzEOX/ingQYV/HjwQzNPRR1P1o32a6/U8RGo7zEBhfdybL6w==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [darwin]
'@node-rs/jieba-darwin-x64@1.10.4':
resolution: {integrity: sha512-MmDNeOb2TXIZCPyWCi2upQnZpPjAxw5ZGEj6R8kNsPXVFALHIKMa6ZZ15LCOkSTsKXVC17j2t4h+hSuyYb6qfQ==}
engines: {node: '>= 10'}
cpu: [x64]
os: [darwin]
'@node-rs/jieba-freebsd-x64@1.10.4':
resolution: {integrity: sha512-/x7aVQ8nqUWhpXU92RZqd333cq639i/olNpd9Z5hdlyyV5/B65LLy+Je2B2bfs62PVVm5QXRpeBcZqaHelp/bg==}
engines: {node: '>= 10'}
cpu: [x64]
os: [freebsd]
'@node-rs/jieba-linux-arm-gnueabihf@1.10.4':
resolution: {integrity: sha512-crd2M35oJBRLkoESs0O6QO3BBbhpv+tqXuKsqhIG94B1d02RVxtRIvSDwO33QurxqSdvN9IeSnVpHbDGkuXm3g==}
engines: {node: '>= 10'}
cpu: [arm]
os: [linux]
'@node-rs/jieba-linux-arm64-gnu@1.10.4':
resolution: {integrity: sha512-omIzNX1psUzPcsdnUhGU6oHeOaTCuCjUgOA/v/DGkvWC1jLcnfXe4vdYbtXMh4XOCuIgS1UCcvZEc8vQLXFbXQ==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [linux]
'@node-rs/jieba-linux-arm64-musl@1.10.4':
resolution: {integrity: sha512-Y/tiJ1+HeS5nnmLbZOE+66LbsPOHZ/PUckAYVeLlQfpygLEpLYdlh0aPpS5uiaWMjAXYZYdFkpZHhxDmSLpwpw==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [linux]
'@node-rs/jieba-linux-x64-gnu@1.10.4':
resolution: {integrity: sha512-WZO8ykRJpWGE9MHuZpy1lu3nJluPoeB+fIJJn5CWZ9YTVhNDWoCF4i/7nxz1ntulINYGQ8VVuCU9LD86Mek97g==}
engines: {node: '>= 10'}
cpu: [x64]
os: [linux]
'@node-rs/jieba-linux-x64-musl@1.10.4':
resolution: {integrity: sha512-uBBD4S1rGKcgCyAk6VCKatEVQb6EDD5I40v/DxODi5CuZVCANi9m5oee/MQbAoaX7RydA2f0OSCE9/tcwXEwUg==}
engines: {node: '>= 10'}
cpu: [x64]
os: [linux]
'@node-rs/jieba-wasm32-wasi@1.10.4':
resolution: {integrity: sha512-Y2umiKHjuIJy0uulNDz9SDYHdfq5Hmy7jY5nORO99B4pySKkcrMjpeVrmWXJLIsEKLJwcCXHxz8tjwU5/uhz0A==}
engines: {node: '>=14.0.0'}
cpu: [wasm32]
'@node-rs/jieba-win32-arm64-msvc@1.10.4':
resolution: {integrity: sha512-nwMtViFm4hjqhz1it/juQnxpXgqlGltCuWJ02bw70YUDMDlbyTy3grCJPpQQpueeETcALUnTxda8pZuVrLRcBA==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [win32]
'@node-rs/jieba-win32-ia32-msvc@1.10.4':
resolution: {integrity: sha512-DCAvLx7Z+W4z5oKS+7vUowAJr0uw9JBw8x1Y23Xs/xMA4Em+OOSiaF5/tCJqZUCJ8uC4QeImmgDFiBqGNwxlyA==}
engines: {node: '>= 10'}
cpu: [ia32]
os: [win32]
'@node-rs/jieba-win32-x64-msvc@1.10.4':
resolution: {integrity: sha512-+sqemSfS1jjb+Tt7InNbNzrRh1Ua3vProVvC4BZRPg010/leCbGFFiQHpzcPRfpxAXZrzG5Y0YBTsPzN/I4yHQ==}
engines: {node: '>= 10'}
cpu: [x64]
os: [win32]
'@node-rs/jieba@1.10.4':
resolution: {integrity: sha512-GvDgi8MnBiyWd6tksojej8anIx18244NmIOc1ovEw8WKNUejcccLfyu8vj66LWSuoZuKILVtNsOy4jvg3aoxIw==}
engines: {node: '>= 10'}
'@nodelib/fs.scandir@2.1.5':
resolution: {integrity: sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==}
engines: {node: '>= 8'}
@ -3214,11 +3307,6 @@ packages:
class-variance-authority@0.7.1:
resolution: {integrity: sha512-Ka+9Trutv7G8M6WT6SeiRWz792K5qEqIGEGzXKhAE6xOWAY6pPH8U+9IY3oCMv6kqTmLsv7Xh/2w2RigkePMsg==}
clawdbot@2026.1.24-3:
resolution: {integrity: sha512-zt9BzhWXduq8ZZR4rfzQDurQWAgmijTTyPZCQGrn5ew6wCEwhxxEr2/NHG7IlCwcfRsKymsY4se9KMhoNz0JtQ==}
engines: {node: '>=22.12.0'}
hasBin: true
cli-cursor@5.0.0:
resolution: {integrity: sha512-aCj4O5wKyszjMmDT4tZj93kxyydN/K5zPWSCe6/0AV/AA1pqe5ZBIw0a2ZfPQV7lL5/yb5HsUreJ6UFAF1tEQw==}
engines: {node: '>=18'}
@ -5193,6 +5281,7 @@ packages:
tar@7.5.4:
resolution: {integrity: sha512-AN04xbWGrSTDmVwlI4/GTlIIwMFk/XEv7uL8aa57zuvRy6s4hdBed+lVq2fAZ89XDa7Us3ANXcE3Tvqvja1kTA==}
engines: {node: '>=18'}
deprecated: Old versions of tar are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exhorbitant rates) by contacting i@izs.me
thenify-all@1.6.0:
resolution: {integrity: sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==}
@ -7160,6 +7249,13 @@ snapshots:
'@napi-rs/canvas-win32-x64-msvc': 0.1.88
optional: true
'@napi-rs/wasm-runtime@0.2.12':
dependencies:
'@emnapi/core': 1.8.1
'@emnapi/runtime': 1.8.1
'@tybys/wasm-util': 0.10.1
optional: true
'@napi-rs/wasm-runtime@1.1.1':
dependencies:
'@emnapi/core': 1.8.1
@ -7222,6 +7318,67 @@ snapshots:
'@node-llama-cpp/win-x64@3.15.0':
optional: true
'@node-rs/jieba-android-arm-eabi@1.10.4':
optional: true
'@node-rs/jieba-android-arm64@1.10.4':
optional: true
'@node-rs/jieba-darwin-arm64@1.10.4':
optional: true
'@node-rs/jieba-darwin-x64@1.10.4':
optional: true
'@node-rs/jieba-freebsd-x64@1.10.4':
optional: true
'@node-rs/jieba-linux-arm-gnueabihf@1.10.4':
optional: true
'@node-rs/jieba-linux-arm64-gnu@1.10.4':
optional: true
'@node-rs/jieba-linux-arm64-musl@1.10.4':
optional: true
'@node-rs/jieba-linux-x64-gnu@1.10.4':
optional: true
'@node-rs/jieba-linux-x64-musl@1.10.4':
optional: true
'@node-rs/jieba-wasm32-wasi@1.10.4':
dependencies:
'@napi-rs/wasm-runtime': 0.2.12
optional: true
'@node-rs/jieba-win32-arm64-msvc@1.10.4':
optional: true
'@node-rs/jieba-win32-ia32-msvc@1.10.4':
optional: true
'@node-rs/jieba-win32-x64-msvc@1.10.4':
optional: true
'@node-rs/jieba@1.10.4':
optionalDependencies:
'@node-rs/jieba-android-arm-eabi': 1.10.4
'@node-rs/jieba-android-arm64': 1.10.4
'@node-rs/jieba-darwin-arm64': 1.10.4
'@node-rs/jieba-darwin-x64': 1.10.4
'@node-rs/jieba-freebsd-x64': 1.10.4
'@node-rs/jieba-linux-arm-gnueabihf': 1.10.4
'@node-rs/jieba-linux-arm64-gnu': 1.10.4
'@node-rs/jieba-linux-arm64-musl': 1.10.4
'@node-rs/jieba-linux-x64-gnu': 1.10.4
'@node-rs/jieba-linux-x64-musl': 1.10.4
'@node-rs/jieba-wasm32-wasi': 1.10.4
'@node-rs/jieba-win32-arm64-msvc': 1.10.4
'@node-rs/jieba-win32-ia32-msvc': 1.10.4
'@node-rs/jieba-win32-x64-msvc': 1.10.4
'@nodelib/fs.scandir@2.1.5':
dependencies:
'@nodelib/fs.stat': 2.0.5
@ -9098,84 +9255,6 @@ snapshots:
dependencies:
clsx: 2.1.1
clawdbot@2026.1.24-3(@types/express@5.0.6)(audio-decode@2.2.3)(devtools-protocol@0.0.1561482)(typescript@5.9.3):
dependencies:
'@agentclientprotocol/sdk': 0.13.1(zod@4.3.6)
'@aws-sdk/client-bedrock': 3.975.0
'@buape/carbon': 0.14.0(hono@4.11.4)
'@clack/prompts': 0.11.0
'@grammyjs/runner': 2.0.3(grammy@1.39.3)
'@grammyjs/transformer-throttler': 1.2.1(grammy@1.39.3)
'@homebridge/ciao': 1.3.4
'@line/bot-sdk': 10.6.0
'@lydell/node-pty': 1.2.0-beta.3
'@mariozechner/pi-agent-core': 0.49.3(ws@8.19.0)(zod@4.3.6)
'@mariozechner/pi-ai': 0.49.3(ws@8.19.0)(zod@4.3.6)
'@mariozechner/pi-coding-agent': 0.49.3(ws@8.19.0)(zod@4.3.6)
'@mariozechner/pi-tui': 0.49.3
'@mozilla/readability': 0.6.0
'@sinclair/typebox': 0.34.47
'@slack/bolt': 4.6.0(@types/express@5.0.6)
'@slack/web-api': 7.13.0
'@whiskeysockets/baileys': 7.0.0-rc.9(audio-decode@2.2.3)(sharp@0.34.5)
ajv: 8.17.1
body-parser: 2.2.2
chalk: 5.6.2
chokidar: 5.0.0
chromium-bidi: 13.0.1(devtools-protocol@0.0.1561482)
cli-highlight: 2.1.11
commander: 14.0.2
croner: 9.1.0
detect-libc: 2.1.2
discord-api-types: 0.38.37
dotenv: 17.2.3
express: 5.2.1
file-type: 21.3.0
grammy: 1.39.3
hono: 4.11.4
jiti: 2.6.1
json5: 2.2.3
jszip: 3.10.1
linkedom: 0.18.12
long: 5.3.2
markdown-it: 14.1.0
node-edge-tts: 1.2.9
osc-progress: 0.3.0
pdfjs-dist: 5.4.530
playwright-core: 1.58.0
proper-lockfile: 4.1.2
qrcode-terminal: 0.12.0
sharp: 0.34.5
sqlite-vec: 0.1.7-alpha.2
tar: 7.5.4
tslog: 4.10.2
undici: 7.19.0
ws: 8.19.0
yaml: 2.8.2
zod: 4.3.6
optionalDependencies:
'@napi-rs/canvas': 0.1.88
node-llama-cpp: 3.15.0(typescript@5.9.3)
transitivePeerDependencies:
- '@discordjs/opus'
- '@modelcontextprotocol/sdk'
- '@types/express'
- audio-decode
- aws-crt
- bufferutil
- canvas
- debug
- devtools-protocol
- encoding
- ffmpeg-static
- jimp
- link-preview-js
- node-opus
- opusscript
- supports-color
- typescript
- utf-8-validate
cli-cursor@5.0.0:
dependencies:
restore-cursor: 5.1.0

View File

@ -9,6 +9,33 @@ describe("memory hybrid helpers", () => {
expect(buildFtsQuery(" ")).toBeNull();
});
it("buildFtsQuery handles Chinese text with tokenization", () => {
  // Pure-Chinese input: segmentation is expected to produce several tokens.
  const query = buildFtsQuery("你好世界");
  // A query must be produced at all ...
  expect(query).not.toBeNull();
  // ... and its tokens must be joined with AND. This is asserted
  // unconditionally so the check can never be skipped silently.
  expect(query).toContain("AND");
});
it("buildFtsQuery handles mixed Chinese and English text", () => {
  const query = buildFtsQuery("你好 hello world 世界");
  expect(query).not.toBeNull();
  // The AND joiner and both quoted English words must appear in the query.
  for (const fragment of ["AND", '"hello"', '"world"']) {
    expect(query).toContain(fragment);
  }
});
it("buildFtsQuery deduplicates tokens", () => {
  const query = buildFtsQuery("hello hello world");
  expect(query).not.toBeNull();
  // Count the quoted occurrences of the repeated word; exactly one may remain.
  const occurrences = query!.match(/"hello"/g) ?? [];
  expect(occurrences.length).toBe(1);
});
it("bm25RankToScore is monotonic and clamped", () => {
expect(bm25RankToScore(0)).toBeCloseTo(1);
expect(bm25RankToScore(1)).toBeCloseTo(0.5);

View File

@ -1,3 +1,5 @@
import { hasChinese, tokenizeMixed } from "./tokenizer-zh.js";
export type HybridSource = string;
export type HybridVectorResult = {
@ -21,13 +23,30 @@ export type HybridKeywordResult = {
};
/**
 * Builds a full-text-search query string from raw input.
 *
 * Each token is wrapped in double quotes (embedded `"` characters removed)
 * and the quoted tokens are joined with " AND ". Returns null when no tokens
 * can be extracted.
 *
 * NOTE(review): this span is a diff hunk that shows the removed regex-only
 * tokenization next to the added Chinese-aware version, which is why some
 * statements appear twice.
 */
export function buildFtsQuery(raw: string): string | null {
const tokens =
raw
.match(/[A-Za-z0-9_]+/g)
?.map((t) => t.trim())
.filter(Boolean) ?? [];
if (!raw || typeof raw !== "string") return null;
const trimmed = raw.trim();
if (!trimmed) return null;
let tokens: string[];
if (hasChinese(trimmed)) {
// Text containing Chinese characters goes through the Chinese-aware tokenizer.
tokens = tokenizeMixed(trimmed);
} else {
// English/numeric text keeps the original regex-based extraction.
tokens =
trimmed
.match(/[A-Za-z0-9_]+/g)
?.map((t) => t.trim())
.filter(Boolean) ?? [];
}
if (tokens.length === 0) return null;
const quoted = tokens.map((t) => `"${t.replaceAll('"', "")}"`);
// De-duplicate while preserving first-seen order.
const uniqueTokens = [...new Set(tokens)];
const quoted = uniqueTokens.map((t) => `"${t.replaceAll('"', "")}"`);
return quoted.join(" AND ");
}

View File

@ -3,6 +3,8 @@ import fsSync from "node:fs";
import fs from "node:fs/promises";
import path from "node:path";
import { tokenizeMixed, hasChinese } from "./tokenizer-zh.js";
export type MemoryFileEntry = {
path: string;
absPath: string;
@ -239,3 +241,27 @@ export function cosineSimilarity(a: number[], b: number[]): number {
if (normA === 0 || normB === 0) return 0;
return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
/**
 * Prepares text for insertion into the FTS index.
 *
 * Text without Chinese characters is returned unchanged. Text that contains
 * Chinese characters is segmented with `tokenizeMixed` and the resulting
 * tokens are joined with single spaces; if segmentation yields no tokens the
 * original text is returned instead.
 *
 * @param text - Raw text to index.
 * @returns The text to store; "" for empty or non-string input.
 */
export function preprocessTextForFts(text: string): string {
  const usable = typeof text === "string" && text.length > 0;
  if (!usable) return "";

  // Only segment when there is Chinese to segment; otherwise keep the text verbatim.
  const segments = hasChinese(text) ? tokenizeMixed(text) : [];
  return segments.length > 0 ? segments.join(" ") : text;
}

View File

@ -37,6 +37,7 @@ import {
isMemoryPath,
listMemoryFiles,
normalizeExtraMemoryPaths,
preprocessTextForFts,
type MemoryChunk,
type MemoryFileEntry,
parseEmbedding,
@ -2202,13 +2203,15 @@ export class MemoryIndexManager {
.run(id, vectorToBlob(embedding));
}
if (this.fts.enabled && this.fts.available) {
// 预处理中文文本用于 FTS 索引
const ftsText = preprocessTextForFts(chunk.text);
this.db
.prepare(
`INSERT INTO ${FTS_TABLE} (text, id, path, source, model, start_line, end_line)\n` +
` VALUES (?, ?, ?, ?, ?, ?, ?)`,
)
.run(
chunk.text,
ftsText,
id,
entry.path,
options.source,

View File

@ -0,0 +1,67 @@
import { describe, it, expect } from "vitest";
import { hasChinese, tokenizeMixed } from "./tokenizer-zh.js";
describe("Chinese tokenizer", () => {
  describe("hasChinese", () => {
    it("detects Chinese characters", () => {
      for (const sample of ["你好", "世界", "你好世界", "中文测试"]) {
        expect(hasChinese(sample)).toBe(true);
      }
    });

    it("returns false for pure English text", () => {
      for (const sample of ["hello world", "test", ""]) {
        expect(hasChinese(sample)).toBe(false);
      }
    });

    it("returns true for mixed Chinese and English", () => {
      for (const sample of ["你好 hello", "hello 世界"]) {
        expect(hasChinese(sample)).toBe(true);
      }
    });

    it("handles empty and invalid input", () => {
      // null/undefined are deliberately forced through the string-typed signature.
      const inputs = ["", null as unknown as string, undefined as unknown as string];
      for (const input of inputs) {
        expect(hasChinese(input)).toBe(false);
      }
    });
  });

  describe("tokenizeMixed", () => {
    it("tokenizes pure Chinese text", () => {
      const tokens = tokenizeMixed("你好世界");
      expect(tokens.length).toBeGreaterThan(1);
      for (const word of ["你好", "世界"]) {
        expect(tokens).toContain(word);
      }
    });

    it("tokenizes Chinese sentence", () => {
      const tokens = tokenizeMixed("今天天气很好");
      expect(tokens.length).toBeGreaterThan(0);
      // Intent: "今天" and "天气" get segmented out. The check itself only
      // requires that some token contains "今天" or "天".
      const hasDayToken = tokens.some((token) => token.includes("今天") || token.includes("天"));
      expect(hasDayToken).toBe(true);
    });

    it("extracts English tokens from pure English text", () => {
      const tokens = tokenizeMixed("hello world test");
      for (const word of ["hello", "world", "test"]) {
        expect(tokens).toContain(word);
      }
    });

    it("tokenizes mixed Chinese and English text", () => {
      const tokens = tokenizeMixed("你好 hello world 世界");
      expect(tokens.length).toBeGreaterThan(2);
      for (const word of ["你好", "hello", "world", "世界"]) {
        expect(tokens).toContain(word);
      }
    });

    it("handles empty input", () => {
      for (const input of ["", null as unknown as string]) {
        expect(tokenizeMixed(input)).toEqual([]);
      }
    });
  });
});

View File

@ -0,0 +1,61 @@
import { cut, load } from "@node-rs/jieba";
// Initialize the jieba dictionary (runs automatically when this module is loaded).
load();
/**
 * Matches a token that consists solely of punctuation, symbols or spaces:
 * ASCII punctuation (including the space), CJK symbols and punctuation,
 * vertical / compatibility / small punctuation forms, and full-width
 * punctuation and signs. No Chinese ideographs are included.
 *
 * Full-width digits (U+FF10-U+FF19) and full-width Latin letters
 * (U+FF21-U+FF3A, U+FF41-U+FF5A) are deliberately excluded from the
 * full-width ranges: they are word characters, and classifying them as
 * punctuation would discard tokens written in full-width letters or digits.
 */
const punctuationPattern =
  /^[\u0020-\u002f\u003a-\u0040\u005b-\u0060\u007b-\u007e\u3000-\u303f\ufe10-\ufe1f\ufe30-\ufe44\ufe50-\ufe6b\uff00-\uff0f\uff1a-\uff20\uff3b-\uff40\uff5b-\uff60\uffe0-\uffe6]+$/;
/**
 * Reports whether `text` contains at least one Chinese (Han) character.
 *
 * Detection uses the Unicode `Script=Han` property instead of the fixed
 * U+4E00-U+9FA5 range, so ideographs outside that range (the tail of the
 * basic block, CJK Extension A starting at U+3400, and the supplementary-plane
 * extensions) are recognised as well.
 *
 * @param text - Text to inspect.
 * @returns `true` when a Han character is present; `false` otherwise,
 *   including for empty or non-string input.
 */
export function hasChinese(text: string): boolean {
  if (typeof text !== "string" || text.length === 0) return false;
  // The `u` flag is required for \p{...} and for code points beyond the BMP.
  return /\p{Script=Han}/u.test(text);
}
/**
 * Splits text that may mix Chinese and English into search tokens.
 *
 * Text without Chinese characters is reduced to its ASCII word runs
 * (`[A-Za-z0-9_]+`). Text with Chinese characters is segmented by jieba;
 * whitespace-only and punctuation-only pieces are discarded and the remaining
 * tokens are de-duplicated in first-seen order.
 *
 * @param text - Text to tokenize.
 * @returns The extracted tokens; an empty array for empty or non-string input.
 */
export function tokenizeMixed(text: string): string[] {
  if (typeof text !== "string" || text.length === 0) return [];

  if (!hasChinese(text)) {
    // No Chinese present: keep only ASCII letter/digit/underscore runs.
    const words = text.match(/[A-Za-z0-9_]+/g) ?? [];
    return words.map((word) => word.trim()).filter(Boolean);
  }

  // Segment the whole string with jieba, passing hmm=false.
  // NOTE(review): the original author states this keeps digits and English
  // words from being split apart; confirm against the jieba documentation.
  const kept = new Set<string>();
  for (const piece of cut(text, false)) {
    // Normalise each piece; empty or whitespace-only pieces collapse to "".
    const token = piece ? piece.trim() : "";
    // Skip blanks and tokens made up purely of punctuation.
    if (token.length === 0 || punctuationPattern.test(token)) continue;
    // A Set keeps insertion order, so this de-duplicates without reordering.
    kept.add(token);
  }
  return [...kept];
}