feat(memory): add Chinese tokenization support
- Add @node-rs/jieba for high-performance Chinese word segmentation - Create tokenizer-zh.ts with hasChinese() and tokenizeMixed() functions - Update buildFtsQuery() to handle mixed Chinese/English text - Add preprocessTextForFts() for FTS indexing with Chinese text - Update manager.ts to use preprocessing when writing to FTS table - Add comprehensive tests for tokenizer and hybrid functions
This commit is contained in:
parent
5f4715acfc
commit
02b1413bf5
@ -168,6 +168,7 @@
|
||||
"@mariozechner/pi-coding-agent": "0.49.3",
|
||||
"@mariozechner/pi-tui": "0.49.3",
|
||||
"@mozilla/readability": "^0.6.0",
|
||||
"@node-rs/jieba": "^1.10.4",
|
||||
"@sinclair/typebox": "0.34.47",
|
||||
"@slack/bolt": "^4.6.0",
|
||||
"@slack/web-api": "^7.13.0",
|
||||
|
||||
251
pnpm-lock.yaml
generated
251
pnpm-lock.yaml
generated
@ -55,6 +55,9 @@ importers:
|
||||
'@mozilla/readability':
|
||||
specifier: ^0.6.0
|
||||
version: 0.6.0
|
||||
'@node-rs/jieba':
|
||||
specifier: ^1.10.4
|
||||
version: 1.10.4
|
||||
'@sinclair/typebox':
|
||||
specifier: 0.34.47
|
||||
version: 0.34.47
|
||||
@ -383,12 +386,12 @@ importers:
|
||||
'@microsoft/agents-hosting-extensions-teams':
|
||||
specifier: ^1.2.2
|
||||
version: 1.2.2
|
||||
moltbot:
|
||||
specifier: workspace:*
|
||||
version: link:../..
|
||||
express:
|
||||
specifier: ^5.2.1
|
||||
version: 5.2.1
|
||||
moltbot:
|
||||
specifier: workspace:*
|
||||
version: link:../..
|
||||
proper-lockfile:
|
||||
specifier: ^4.1.2
|
||||
version: 4.1.2
|
||||
@ -1557,6 +1560,9 @@ packages:
|
||||
resolution: {integrity: sha512-/p08f93LEbsL5mDZFQ3DBxcPv/I4QG9EDYRRq1WNlCOXVfAHBTHMSVMwxlqG/AtnSfUr9+vgfN7MKiyDo0+Weg==}
|
||||
engines: {node: '>= 10'}
|
||||
|
||||
'@napi-rs/wasm-runtime@0.2.12':
|
||||
resolution: {integrity: sha512-ZVWUcfwY4E/yPitQJl481FjFo3K22D6qF0DuFH6Y/nbnE11GY5uguDxZMGXPQ8WQ0128MXQD7TnfHyK4oWoIJQ==}
|
||||
|
||||
'@napi-rs/wasm-runtime@1.1.1':
|
||||
resolution: {integrity: sha512-p64ah1M1ld8xjWv3qbvFwHiFVWrq1yFvV4f7w+mzaqiR4IlSgkqhcRdHwsGgomwzBH51sRY4NEowLxnaBjcW/A==}
|
||||
|
||||
@ -1658,6 +1664,93 @@ packages:
|
||||
cpu: [x64]
|
||||
os: [win32]
|
||||
|
||||
'@node-rs/jieba-android-arm-eabi@1.10.4':
|
||||
resolution: {integrity: sha512-MhyvW5N3Fwcp385d0rxbCWH42kqDBatQTyP8XbnYbju2+0BO/eTeCCLYj7Agws4pwxn2LtdldXRSKavT7WdzNA==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm]
|
||||
os: [android]
|
||||
|
||||
'@node-rs/jieba-android-arm64@1.10.4':
|
||||
resolution: {integrity: sha512-XyDwq5+rQ+Tk55A+FGi6PtJbzf974oqnpyCcCPzwU3QVXJCa2Rr4Lci+fx8oOpU4plT3GuD+chXMYLsXipMgJA==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [android]
|
||||
|
||||
'@node-rs/jieba-darwin-arm64@1.10.4':
|
||||
resolution: {integrity: sha512-G++RYEJ2jo0rxF9626KUy90wp06TRUjAsvY/BrIzEOX/ingQYV/HjwQzNPRR1P1o32a6/U8RGo7zEBhfdybL6w==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [darwin]
|
||||
|
||||
'@node-rs/jieba-darwin-x64@1.10.4':
|
||||
resolution: {integrity: sha512-MmDNeOb2TXIZCPyWCi2upQnZpPjAxw5ZGEj6R8kNsPXVFALHIKMa6ZZ15LCOkSTsKXVC17j2t4h+hSuyYb6qfQ==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [darwin]
|
||||
|
||||
'@node-rs/jieba-freebsd-x64@1.10.4':
|
||||
resolution: {integrity: sha512-/x7aVQ8nqUWhpXU92RZqd333cq639i/olNpd9Z5hdlyyV5/B65LLy+Je2B2bfs62PVVm5QXRpeBcZqaHelp/bg==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [freebsd]
|
||||
|
||||
'@node-rs/jieba-linux-arm-gnueabihf@1.10.4':
|
||||
resolution: {integrity: sha512-crd2M35oJBRLkoESs0O6QO3BBbhpv+tqXuKsqhIG94B1d02RVxtRIvSDwO33QurxqSdvN9IeSnVpHbDGkuXm3g==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm]
|
||||
os: [linux]
|
||||
|
||||
'@node-rs/jieba-linux-arm64-gnu@1.10.4':
|
||||
resolution: {integrity: sha512-omIzNX1psUzPcsdnUhGU6oHeOaTCuCjUgOA/v/DGkvWC1jLcnfXe4vdYbtXMh4XOCuIgS1UCcvZEc8vQLXFbXQ==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [linux]
|
||||
|
||||
'@node-rs/jieba-linux-arm64-musl@1.10.4':
|
||||
resolution: {integrity: sha512-Y/tiJ1+HeS5nnmLbZOE+66LbsPOHZ/PUckAYVeLlQfpygLEpLYdlh0aPpS5uiaWMjAXYZYdFkpZHhxDmSLpwpw==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [linux]
|
||||
|
||||
'@node-rs/jieba-linux-x64-gnu@1.10.4':
|
||||
resolution: {integrity: sha512-WZO8ykRJpWGE9MHuZpy1lu3nJluPoeB+fIJJn5CWZ9YTVhNDWoCF4i/7nxz1ntulINYGQ8VVuCU9LD86Mek97g==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [linux]
|
||||
|
||||
'@node-rs/jieba-linux-x64-musl@1.10.4':
|
||||
resolution: {integrity: sha512-uBBD4S1rGKcgCyAk6VCKatEVQb6EDD5I40v/DxODi5CuZVCANi9m5oee/MQbAoaX7RydA2f0OSCE9/tcwXEwUg==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [linux]
|
||||
|
||||
'@node-rs/jieba-wasm32-wasi@1.10.4':
|
||||
resolution: {integrity: sha512-Y2umiKHjuIJy0uulNDz9SDYHdfq5Hmy7jY5nORO99B4pySKkcrMjpeVrmWXJLIsEKLJwcCXHxz8tjwU5/uhz0A==}
|
||||
engines: {node: '>=14.0.0'}
|
||||
cpu: [wasm32]
|
||||
|
||||
'@node-rs/jieba-win32-arm64-msvc@1.10.4':
|
||||
resolution: {integrity: sha512-nwMtViFm4hjqhz1it/juQnxpXgqlGltCuWJ02bw70YUDMDlbyTy3grCJPpQQpueeETcALUnTxda8pZuVrLRcBA==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [arm64]
|
||||
os: [win32]
|
||||
|
||||
'@node-rs/jieba-win32-ia32-msvc@1.10.4':
|
||||
resolution: {integrity: sha512-DCAvLx7Z+W4z5oKS+7vUowAJr0uw9JBw8x1Y23Xs/xMA4Em+OOSiaF5/tCJqZUCJ8uC4QeImmgDFiBqGNwxlyA==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [ia32]
|
||||
os: [win32]
|
||||
|
||||
'@node-rs/jieba-win32-x64-msvc@1.10.4':
|
||||
resolution: {integrity: sha512-+sqemSfS1jjb+Tt7InNbNzrRh1Ua3vProVvC4BZRPg010/leCbGFFiQHpzcPRfpxAXZrzG5Y0YBTsPzN/I4yHQ==}
|
||||
engines: {node: '>= 10'}
|
||||
cpu: [x64]
|
||||
os: [win32]
|
||||
|
||||
'@node-rs/jieba@1.10.4':
|
||||
resolution: {integrity: sha512-GvDgi8MnBiyWd6tksojej8anIx18244NmIOc1ovEw8WKNUejcccLfyu8vj66LWSuoZuKILVtNsOy4jvg3aoxIw==}
|
||||
engines: {node: '>= 10'}
|
||||
|
||||
'@nodelib/fs.scandir@2.1.5':
|
||||
resolution: {integrity: sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==}
|
||||
engines: {node: '>= 8'}
|
||||
@ -3214,11 +3307,6 @@ packages:
|
||||
class-variance-authority@0.7.1:
|
||||
resolution: {integrity: sha512-Ka+9Trutv7G8M6WT6SeiRWz792K5qEqIGEGzXKhAE6xOWAY6pPH8U+9IY3oCMv6kqTmLsv7Xh/2w2RigkePMsg==}
|
||||
|
||||
clawdbot@2026.1.24-3:
|
||||
resolution: {integrity: sha512-zt9BzhWXduq8ZZR4rfzQDurQWAgmijTTyPZCQGrn5ew6wCEwhxxEr2/NHG7IlCwcfRsKymsY4se9KMhoNz0JtQ==}
|
||||
engines: {node: '>=22.12.0'}
|
||||
hasBin: true
|
||||
|
||||
cli-cursor@5.0.0:
|
||||
resolution: {integrity: sha512-aCj4O5wKyszjMmDT4tZj93kxyydN/K5zPWSCe6/0AV/AA1pqe5ZBIw0a2ZfPQV7lL5/yb5HsUreJ6UFAF1tEQw==}
|
||||
engines: {node: '>=18'}
|
||||
@ -5193,6 +5281,7 @@ packages:
|
||||
tar@7.5.4:
|
||||
resolution: {integrity: sha512-AN04xbWGrSTDmVwlI4/GTlIIwMFk/XEv7uL8aa57zuvRy6s4hdBed+lVq2fAZ89XDa7Us3ANXcE3Tvqvja1kTA==}
|
||||
engines: {node: '>=18'}
|
||||
deprecated: Old versions of tar are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exhorbitant rates) by contacting i@izs.me
|
||||
|
||||
thenify-all@1.6.0:
|
||||
resolution: {integrity: sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==}
|
||||
@ -7160,6 +7249,13 @@ snapshots:
|
||||
'@napi-rs/canvas-win32-x64-msvc': 0.1.88
|
||||
optional: true
|
||||
|
||||
'@napi-rs/wasm-runtime@0.2.12':
|
||||
dependencies:
|
||||
'@emnapi/core': 1.8.1
|
||||
'@emnapi/runtime': 1.8.1
|
||||
'@tybys/wasm-util': 0.10.1
|
||||
optional: true
|
||||
|
||||
'@napi-rs/wasm-runtime@1.1.1':
|
||||
dependencies:
|
||||
'@emnapi/core': 1.8.1
|
||||
@ -7222,6 +7318,67 @@ snapshots:
|
||||
'@node-llama-cpp/win-x64@3.15.0':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-android-arm-eabi@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-android-arm64@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-darwin-arm64@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-darwin-x64@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-freebsd-x64@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-linux-arm-gnueabihf@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-linux-arm64-gnu@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-linux-arm64-musl@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-linux-x64-gnu@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-linux-x64-musl@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-wasm32-wasi@1.10.4':
|
||||
dependencies:
|
||||
'@napi-rs/wasm-runtime': 0.2.12
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-win32-arm64-msvc@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-win32-ia32-msvc@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba-win32-x64-msvc@1.10.4':
|
||||
optional: true
|
||||
|
||||
'@node-rs/jieba@1.10.4':
|
||||
optionalDependencies:
|
||||
'@node-rs/jieba-android-arm-eabi': 1.10.4
|
||||
'@node-rs/jieba-android-arm64': 1.10.4
|
||||
'@node-rs/jieba-darwin-arm64': 1.10.4
|
||||
'@node-rs/jieba-darwin-x64': 1.10.4
|
||||
'@node-rs/jieba-freebsd-x64': 1.10.4
|
||||
'@node-rs/jieba-linux-arm-gnueabihf': 1.10.4
|
||||
'@node-rs/jieba-linux-arm64-gnu': 1.10.4
|
||||
'@node-rs/jieba-linux-arm64-musl': 1.10.4
|
||||
'@node-rs/jieba-linux-x64-gnu': 1.10.4
|
||||
'@node-rs/jieba-linux-x64-musl': 1.10.4
|
||||
'@node-rs/jieba-wasm32-wasi': 1.10.4
|
||||
'@node-rs/jieba-win32-arm64-msvc': 1.10.4
|
||||
'@node-rs/jieba-win32-ia32-msvc': 1.10.4
|
||||
'@node-rs/jieba-win32-x64-msvc': 1.10.4
|
||||
|
||||
'@nodelib/fs.scandir@2.1.5':
|
||||
dependencies:
|
||||
'@nodelib/fs.stat': 2.0.5
|
||||
@ -9098,84 +9255,6 @@ snapshots:
|
||||
dependencies:
|
||||
clsx: 2.1.1
|
||||
|
||||
clawdbot@2026.1.24-3(@types/express@5.0.6)(audio-decode@2.2.3)(devtools-protocol@0.0.1561482)(typescript@5.9.3):
|
||||
dependencies:
|
||||
'@agentclientprotocol/sdk': 0.13.1(zod@4.3.6)
|
||||
'@aws-sdk/client-bedrock': 3.975.0
|
||||
'@buape/carbon': 0.14.0(hono@4.11.4)
|
||||
'@clack/prompts': 0.11.0
|
||||
'@grammyjs/runner': 2.0.3(grammy@1.39.3)
|
||||
'@grammyjs/transformer-throttler': 1.2.1(grammy@1.39.3)
|
||||
'@homebridge/ciao': 1.3.4
|
||||
'@line/bot-sdk': 10.6.0
|
||||
'@lydell/node-pty': 1.2.0-beta.3
|
||||
'@mariozechner/pi-agent-core': 0.49.3(ws@8.19.0)(zod@4.3.6)
|
||||
'@mariozechner/pi-ai': 0.49.3(ws@8.19.0)(zod@4.3.6)
|
||||
'@mariozechner/pi-coding-agent': 0.49.3(ws@8.19.0)(zod@4.3.6)
|
||||
'@mariozechner/pi-tui': 0.49.3
|
||||
'@mozilla/readability': 0.6.0
|
||||
'@sinclair/typebox': 0.34.47
|
||||
'@slack/bolt': 4.6.0(@types/express@5.0.6)
|
||||
'@slack/web-api': 7.13.0
|
||||
'@whiskeysockets/baileys': 7.0.0-rc.9(audio-decode@2.2.3)(sharp@0.34.5)
|
||||
ajv: 8.17.1
|
||||
body-parser: 2.2.2
|
||||
chalk: 5.6.2
|
||||
chokidar: 5.0.0
|
||||
chromium-bidi: 13.0.1(devtools-protocol@0.0.1561482)
|
||||
cli-highlight: 2.1.11
|
||||
commander: 14.0.2
|
||||
croner: 9.1.0
|
||||
detect-libc: 2.1.2
|
||||
discord-api-types: 0.38.37
|
||||
dotenv: 17.2.3
|
||||
express: 5.2.1
|
||||
file-type: 21.3.0
|
||||
grammy: 1.39.3
|
||||
hono: 4.11.4
|
||||
jiti: 2.6.1
|
||||
json5: 2.2.3
|
||||
jszip: 3.10.1
|
||||
linkedom: 0.18.12
|
||||
long: 5.3.2
|
||||
markdown-it: 14.1.0
|
||||
node-edge-tts: 1.2.9
|
||||
osc-progress: 0.3.0
|
||||
pdfjs-dist: 5.4.530
|
||||
playwright-core: 1.58.0
|
||||
proper-lockfile: 4.1.2
|
||||
qrcode-terminal: 0.12.0
|
||||
sharp: 0.34.5
|
||||
sqlite-vec: 0.1.7-alpha.2
|
||||
tar: 7.5.4
|
||||
tslog: 4.10.2
|
||||
undici: 7.19.0
|
||||
ws: 8.19.0
|
||||
yaml: 2.8.2
|
||||
zod: 4.3.6
|
||||
optionalDependencies:
|
||||
'@napi-rs/canvas': 0.1.88
|
||||
node-llama-cpp: 3.15.0(typescript@5.9.3)
|
||||
transitivePeerDependencies:
|
||||
- '@discordjs/opus'
|
||||
- '@modelcontextprotocol/sdk'
|
||||
- '@types/express'
|
||||
- audio-decode
|
||||
- aws-crt
|
||||
- bufferutil
|
||||
- canvas
|
||||
- debug
|
||||
- devtools-protocol
|
||||
- encoding
|
||||
- ffmpeg-static
|
||||
- jimp
|
||||
- link-preview-js
|
||||
- node-opus
|
||||
- opusscript
|
||||
- supports-color
|
||||
- typescript
|
||||
- utf-8-validate
|
||||
|
||||
cli-cursor@5.0.0:
|
||||
dependencies:
|
||||
restore-cursor: 5.1.0
|
||||
|
||||
@ -9,6 +9,33 @@ describe("memory hybrid helpers", () => {
|
||||
expect(buildFtsQuery(" ")).toBeNull();
|
||||
});
|
||||
|
||||
it("buildFtsQuery handles Chinese text with tokenization", () => {
  // Pure-Chinese input is segmented into several tokens, so the query must
  // exist and must join those tokens with AND. The assertion is unconditional
  // so it can never be skipped silently.
  const query = buildFtsQuery("你好世界");
  expect(query).not.toBeNull();
  expect(query).toContain("AND");
});

it("buildFtsQuery handles mixed Chinese and English text", () => {
  const query = buildFtsQuery("你好 hello world 世界");
  expect(query).not.toBeNull();
  expect(query).toContain("AND");
  expect(query).toContain('"hello"');
  expect(query).toContain('"world"');
});

it("buildFtsQuery deduplicates tokens", () => {
  const query = buildFtsQuery("hello hello world");
  expect(query).not.toBeNull();
  // "hello" should only appear once in the generated expression.
  const helloCount = (query ?? "").match(/"hello"/g)?.length ?? 0;
  expect(helloCount).toBe(1);
});
|
||||
|
||||
it("bm25RankToScore is monotonic and clamped", () => {
|
||||
expect(bm25RankToScore(0)).toBeCloseTo(1);
|
||||
expect(bm25RankToScore(1)).toBeCloseTo(0.5);
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
import { hasChinese, tokenizeMixed } from "./tokenizer-zh.js";
|
||||
|
||||
export type HybridSource = string;
|
||||
|
||||
export type HybridVectorResult = {
|
||||
@ -21,13 +23,30 @@ export type HybridKeywordResult = {
|
||||
};
|
||||
|
||||
/**
 * Build a full-text-search MATCH expression from free-form input.
 *
 * Input containing Chinese characters is segmented with `tokenizeMixed`;
 * any other input is reduced to its ASCII word tokens (`[A-Za-z0-9_]+`).
 * Every token has its double quotes removed, is wrapped in double quotes,
 * and the distinct terms are joined with ` AND ` in first-seen order.
 *
 * @param raw - The raw search string.
 * @returns The query expression, or `null` when `raw` is empty, is not a
 *   string, or yields no usable token.
 */
export function buildFtsQuery(raw: string): string | null {
  // Validate before touching `raw`: callers may pass untyped values.
  if (!raw || typeof raw !== "string") return null;
  const trimmed = raw.trim();
  if (!trimmed) return null;

  const tokens = hasChinese(trimmed)
    ? tokenizeMixed(trimmed)
    : (trimmed.match(/[A-Za-z0-9_]+/g) ?? []);

  // Strip quotes before de-duplicating so that tokens differing only by a
  // quote collapse into one term, and drop tokens that become empty so no
  // `""` phrase is emitted.
  const terms = [
    ...new Set(tokens.map((token) => token.replaceAll('"', "")).filter(Boolean)),
  ];
  if (terms.length === 0) return null;

  return terms.map((term) => `"${term}"`).join(" AND ");
}
|
||||
|
||||
|
||||
@ -3,6 +3,8 @@ import fsSync from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
|
||||
import { tokenizeMixed, hasChinese } from "./tokenizer-zh.js";
|
||||
|
||||
export type MemoryFileEntry = {
|
||||
path: string;
|
||||
absPath: string;
|
||||
@ -239,3 +241,27 @@ export function cosineSimilarity(a: number[], b: number[]): number {
|
||||
if (normA === 0 || normB === 0) return 0;
|
||||
return dot / (Math.sqrt(normA) * Math.sqrt(normB));
|
||||
}
|
||||
|
||||
/**
 * Prepare chunk text for insertion into the FTS index.
 *
 * Text without Chinese characters is returned unchanged. Text with Chinese
 * characters is replaced by the tokens from `tokenizeMixed`, joined with
 * single spaces; if tokenization yields nothing the original text is kept.
 *
 * @param text - The text to index.
 * @returns The text to store in the FTS table, or `""` for empty or
 *   non-string input.
 */
export function preprocessTextForFts(text: string): string {
  if (typeof text !== "string" || text === "") return "";

  if (hasChinese(text)) {
    const segments = tokenizeMixed(text);
    if (segments.length > 0) {
      // Space-separated so each segment is a separate word in the stored text.
      return segments.join(" ");
    }
  }

  return text;
}
|
||||
|
||||
@ -37,6 +37,7 @@ import {
|
||||
isMemoryPath,
|
||||
listMemoryFiles,
|
||||
normalizeExtraMemoryPaths,
|
||||
preprocessTextForFts,
|
||||
type MemoryChunk,
|
||||
type MemoryFileEntry,
|
||||
parseEmbedding,
|
||||
@ -2202,13 +2203,15 @@ export class MemoryIndexManager {
|
||||
.run(id, vectorToBlob(embedding));
|
||||
}
|
||||
if (this.fts.enabled && this.fts.available) {
|
||||
// 预处理中文文本用于 FTS 索引
|
||||
const ftsText = preprocessTextForFts(chunk.text);
|
||||
this.db
|
||||
.prepare(
|
||||
`INSERT INTO ${FTS_TABLE} (text, id, path, source, model, start_line, end_line)\n` +
|
||||
` VALUES (?, ?, ?, ?, ?, ?, ?)`,
|
||||
)
|
||||
.run(
|
||||
chunk.text,
|
||||
ftsText,
|
||||
id,
|
||||
entry.path,
|
||||
options.source,
|
||||
|
||||
67
src/memory/tokenizer-zh.test.ts
Normal file
67
src/memory/tokenizer-zh.test.ts
Normal file
@ -0,0 +1,67 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { hasChinese, tokenizeMixed } from "./tokenizer-zh.js";
|
||||
|
||||
describe("Chinese tokenizer", () => {
  describe("hasChinese", () => {
    it("detects Chinese characters", () => {
      for (const sample of ["你好", "世界", "你好世界", "中文测试"]) {
        expect(hasChinese(sample)).toBe(true);
      }
    });

    it("returns false for pure English text", () => {
      for (const sample of ["hello world", "test", ""]) {
        expect(hasChinese(sample)).toBe(false);
      }
    });

    it("returns true for mixed Chinese and English", () => {
      for (const sample of ["你好 hello", "hello 世界"]) {
        expect(hasChinese(sample)).toBe(true);
      }
    });

    it("handles empty and invalid input", () => {
      // The casts simulate untyped callers passing null / undefined.
      const inputs = ["", null as unknown as string, undefined as unknown as string];
      for (const input of inputs) {
        expect(hasChinese(input)).toBe(false);
      }
    });
  });

  describe("tokenizeMixed", () => {
    it("tokenizes pure Chinese text", () => {
      const tokens = tokenizeMixed("你好世界");
      expect(tokens.length).toBeGreaterThan(1);
      expect(tokens).toContain("你好");
      expect(tokens).toContain("世界");
    });

    it("tokenizes Chinese sentence", () => {
      const tokens = tokenizeMixed("今天天气很好");
      expect(tokens.length).toBeGreaterThan(0);
      // Loose check: at least one token must contain "天" (ideally "今天" and
      // "天气" are segmented out as separate words).
      const hasDayToken = tokens.some((t) => t.includes("今天") || t.includes("天"));
      expect(hasDayToken).toBe(true);
    });

    it("extracts English tokens from pure English text", () => {
      const tokens = tokenizeMixed("hello world test");
      for (const word of ["hello", "world", "test"]) {
        expect(tokens).toContain(word);
      }
    });

    it("tokenizes mixed Chinese and English text", () => {
      const tokens = tokenizeMixed("你好 hello world 世界");
      expect(tokens.length).toBeGreaterThan(2);
      for (const word of ["你好", "hello", "world", "世界"]) {
        expect(tokens).toContain(word);
      }
    });

    it("handles empty input", () => {
      expect(tokenizeMixed("")).toEqual([]);
      expect(tokenizeMixed(null as unknown as string)).toEqual([]);
    });
  });
});
|
||||
61
src/memory/tokenizer-zh.ts
Normal file
61
src/memory/tokenizer-zh.ts
Normal file
@ -0,0 +1,61 @@
|
||||
import { cut, load } from "@node-rs/jieba";
|
||||
|
||||
// Initialize the jieba dictionary (loaded once, as a side effect of importing this module)
|
||||
load();
|
||||
|
||||
/**
 * Matches a token that consists solely of punctuation, symbols, or
 * separators, in any script.
 *
 * Unicode general categories are used instead of hand-picked code-point
 * ranges so that:
 *  - full-width letters and digits (e.g. "ＡＢＣ", "１２３") are NOT treated as
 *    punctuation and therefore survive token filtering;
 *  - General Punctuation characters common in Chinese text (curly quotes,
 *    em dash, ellipsis) ARE recognized as punctuation.
 * Letter- or number-like characters in the CJK Symbols block (e.g. "々",
 * "〇") are likewise no longer classified as punctuation.
 */
const punctuationPattern = /^[\p{P}\p{S}\p{Z}]+$/u;
|
||||
|
||||
/**
 * Report whether `text` contains at least one Han (Chinese) character.
 *
 * Detection uses the Unicode `Script=Han` property rather than the single
 * range U+4E00–U+9FA5, so ideographs in the rest of the basic block
 * (U+9FA6–U+9FFF), Extension A, and the supplementary planes are recognized
 * as well. CJK punctuation and kana are not Han and do not match.
 *
 * @param text - The text to inspect.
 * @returns `true` if a Han character is present; `false` otherwise,
 *   including for empty or non-string input.
 */
export function hasChinese(text: string): boolean {
  if (!text || typeof text !== "string") return false;
  // The `u` flag is required both for \p{…} and to match astral code points.
  return /\p{Script=Han}/u.test(text);
}
|
||||
|
||||
/**
 * Split text that may mix Chinese and ASCII content into search tokens.
 *
 * - Text without Chinese characters: returns every `[A-Za-z0-9_]+` run, in
 *   order (duplicates are kept).
 * - Text with Chinese characters: segments the whole string with jieba's
 *   `cut`, trims each segment, discards empty and punctuation-only segments,
 *   and returns the distinct tokens in first-seen order.
 *
 * @param text - The text to tokenize.
 * @returns The tokens; an empty array for empty or non-string input.
 */
export function tokenizeMixed(text: string): string[] {
  if (!text || typeof text !== "string") return [];

  if (!hasChinese(text)) {
    const asciiWords = text.match(/[A-Za-z0-9_]+/g);
    return asciiWords === null ? [] : asciiWords.map((w) => w.trim()).filter(Boolean);
  }

  // Second argument `false`: per the original author's note this turns the
  // HMM mode off so that digit/English runs are not split apart.
  const distinct = new Set<string>();
  for (const segment of cut(text, false)) {
    const word = segment ? segment.trim() : "";
    // Skip whitespace-only and punctuation-only segments.
    if (word === "" || punctuationPattern.test(word)) continue;
    // A Set keeps insertion order, so this de-duplicates in first-seen order.
    distinct.add(word);
  }

  return Array.from(distinct);
}
|
||||
Loading…
Reference in New Issue
Block a user