From 02b1413bf5922407a7cb8df3e9334af13d82344e Mon Sep 17 00:00:00 2001 From: Jay Liu Date: Fri, 30 Jan 2026 00:11:44 +1300 Subject: [PATCH] feat(memory): add Chinese tokenization support - Add @node-rs/jieba for high-performance Chinese word segmentation - Create tokenizer-zh.ts with hasChinese() and tokenizeMixed() functions - Update buildFtsQuery() to handle mixed Chinese/English text - Add preprocessTextForFts() for FTS indexing with Chinese text - Update manager.ts to use preprocessing when writing to FTS table - Add comprehensive tests for tokenizer and hybrid functions --- package.json | 1 + pnpm-lock.yaml | 251 +++++++++++++++++++++----------- src/memory/hybrid.test.ts | 27 ++++ src/memory/hybrid.ts | 31 +++- src/memory/internal.ts | 26 ++++ src/memory/manager.ts | 5 +- src/memory/tokenizer-zh.test.ts | 67 +++++++++ src/memory/tokenizer-zh.ts | 61 ++++++++ 8 files changed, 376 insertions(+), 93 deletions(-) create mode 100644 src/memory/tokenizer-zh.test.ts create mode 100644 src/memory/tokenizer-zh.ts diff --git a/package.json b/package.json index 04322f3af..6cb8a51ab 100644 --- a/package.json +++ b/package.json @@ -168,6 +168,7 @@ "@mariozechner/pi-coding-agent": "0.49.3", "@mariozechner/pi-tui": "0.49.3", "@mozilla/readability": "^0.6.0", + "@node-rs/jieba": "^1.10.4", "@sinclair/typebox": "0.34.47", "@slack/bolt": "^4.6.0", "@slack/web-api": "^7.13.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9c0f99928..0cf55d9f0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -55,6 +55,9 @@ importers: '@mozilla/readability': specifier: ^0.6.0 version: 0.6.0 + '@node-rs/jieba': + specifier: ^1.10.4 + version: 1.10.4 '@sinclair/typebox': specifier: 0.34.47 version: 0.34.47 @@ -383,12 +386,12 @@ importers: '@microsoft/agents-hosting-extensions-teams': specifier: ^1.2.2 version: 1.2.2 - moltbot: - specifier: workspace:* - version: link:../.. express: specifier: ^5.2.1 version: 5.2.1 + moltbot: + specifier: workspace:* + version: link:../.. 
proper-lockfile: specifier: ^4.1.2 version: 4.1.2 @@ -1557,6 +1560,9 @@ packages: resolution: {integrity: sha512-/p08f93LEbsL5mDZFQ3DBxcPv/I4QG9EDYRRq1WNlCOXVfAHBTHMSVMwxlqG/AtnSfUr9+vgfN7MKiyDo0+Weg==} engines: {node: '>= 10'} + '@napi-rs/wasm-runtime@0.2.12': + resolution: {integrity: sha512-ZVWUcfwY4E/yPitQJl481FjFo3K22D6qF0DuFH6Y/nbnE11GY5uguDxZMGXPQ8WQ0128MXQD7TnfHyK4oWoIJQ==} + '@napi-rs/wasm-runtime@1.1.1': resolution: {integrity: sha512-p64ah1M1ld8xjWv3qbvFwHiFVWrq1yFvV4f7w+mzaqiR4IlSgkqhcRdHwsGgomwzBH51sRY4NEowLxnaBjcW/A==} @@ -1658,6 +1664,93 @@ packages: cpu: [x64] os: [win32] + '@node-rs/jieba-android-arm-eabi@1.10.4': + resolution: {integrity: sha512-MhyvW5N3Fwcp385d0rxbCWH42kqDBatQTyP8XbnYbju2+0BO/eTeCCLYj7Agws4pwxn2LtdldXRSKavT7WdzNA==} + engines: {node: '>= 10'} + cpu: [arm] + os: [android] + + '@node-rs/jieba-android-arm64@1.10.4': + resolution: {integrity: sha512-XyDwq5+rQ+Tk55A+FGi6PtJbzf974oqnpyCcCPzwU3QVXJCa2Rr4Lci+fx8oOpU4plT3GuD+chXMYLsXipMgJA==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [android] + + '@node-rs/jieba-darwin-arm64@1.10.4': + resolution: {integrity: sha512-G++RYEJ2jo0rxF9626KUy90wp06TRUjAsvY/BrIzEOX/ingQYV/HjwQzNPRR1P1o32a6/U8RGo7zEBhfdybL6w==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [darwin] + + '@node-rs/jieba-darwin-x64@1.10.4': + resolution: {integrity: sha512-MmDNeOb2TXIZCPyWCi2upQnZpPjAxw5ZGEj6R8kNsPXVFALHIKMa6ZZ15LCOkSTsKXVC17j2t4h+hSuyYb6qfQ==} + engines: {node: '>= 10'} + cpu: [x64] + os: [darwin] + + '@node-rs/jieba-freebsd-x64@1.10.4': + resolution: {integrity: sha512-/x7aVQ8nqUWhpXU92RZqd333cq639i/olNpd9Z5hdlyyV5/B65LLy+Je2B2bfs62PVVm5QXRpeBcZqaHelp/bg==} + engines: {node: '>= 10'} + cpu: [x64] + os: [freebsd] + + '@node-rs/jieba-linux-arm-gnueabihf@1.10.4': + resolution: {integrity: sha512-crd2M35oJBRLkoESs0O6QO3BBbhpv+tqXuKsqhIG94B1d02RVxtRIvSDwO33QurxqSdvN9IeSnVpHbDGkuXm3g==} + engines: {node: '>= 10'} + cpu: [arm] + os: [linux] + + '@node-rs/jieba-linux-arm64-gnu@1.10.4': + resolution: {integrity: 
sha512-omIzNX1psUzPcsdnUhGU6oHeOaTCuCjUgOA/v/DGkvWC1jLcnfXe4vdYbtXMh4XOCuIgS1UCcvZEc8vQLXFbXQ==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [linux] + + '@node-rs/jieba-linux-arm64-musl@1.10.4': + resolution: {integrity: sha512-Y/tiJ1+HeS5nnmLbZOE+66LbsPOHZ/PUckAYVeLlQfpygLEpLYdlh0aPpS5uiaWMjAXYZYdFkpZHhxDmSLpwpw==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [linux] + + '@node-rs/jieba-linux-x64-gnu@1.10.4': + resolution: {integrity: sha512-WZO8ykRJpWGE9MHuZpy1lu3nJluPoeB+fIJJn5CWZ9YTVhNDWoCF4i/7nxz1ntulINYGQ8VVuCU9LD86Mek97g==} + engines: {node: '>= 10'} + cpu: [x64] + os: [linux] + + '@node-rs/jieba-linux-x64-musl@1.10.4': + resolution: {integrity: sha512-uBBD4S1rGKcgCyAk6VCKatEVQb6EDD5I40v/DxODi5CuZVCANi9m5oee/MQbAoaX7RydA2f0OSCE9/tcwXEwUg==} + engines: {node: '>= 10'} + cpu: [x64] + os: [linux] + + '@node-rs/jieba-wasm32-wasi@1.10.4': + resolution: {integrity: sha512-Y2umiKHjuIJy0uulNDz9SDYHdfq5Hmy7jY5nORO99B4pySKkcrMjpeVrmWXJLIsEKLJwcCXHxz8tjwU5/uhz0A==} + engines: {node: '>=14.0.0'} + cpu: [wasm32] + + '@node-rs/jieba-win32-arm64-msvc@1.10.4': + resolution: {integrity: sha512-nwMtViFm4hjqhz1it/juQnxpXgqlGltCuWJ02bw70YUDMDlbyTy3grCJPpQQpueeETcALUnTxda8pZuVrLRcBA==} + engines: {node: '>= 10'} + cpu: [arm64] + os: [win32] + + '@node-rs/jieba-win32-ia32-msvc@1.10.4': + resolution: {integrity: sha512-DCAvLx7Z+W4z5oKS+7vUowAJr0uw9JBw8x1Y23Xs/xMA4Em+OOSiaF5/tCJqZUCJ8uC4QeImmgDFiBqGNwxlyA==} + engines: {node: '>= 10'} + cpu: [ia32] + os: [win32] + + '@node-rs/jieba-win32-x64-msvc@1.10.4': + resolution: {integrity: sha512-+sqemSfS1jjb+Tt7InNbNzrRh1Ua3vProVvC4BZRPg010/leCbGFFiQHpzcPRfpxAXZrzG5Y0YBTsPzN/I4yHQ==} + engines: {node: '>= 10'} + cpu: [x64] + os: [win32] + + '@node-rs/jieba@1.10.4': + resolution: {integrity: sha512-GvDgi8MnBiyWd6tksojej8anIx18244NmIOc1ovEw8WKNUejcccLfyu8vj66LWSuoZuKILVtNsOy4jvg3aoxIw==} + engines: {node: '>= 10'} + '@nodelib/fs.scandir@2.1.5': resolution: {integrity: 
sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==} engines: {node: '>= 8'} @@ -3214,11 +3307,6 @@ packages: class-variance-authority@0.7.1: resolution: {integrity: sha512-Ka+9Trutv7G8M6WT6SeiRWz792K5qEqIGEGzXKhAE6xOWAY6pPH8U+9IY3oCMv6kqTmLsv7Xh/2w2RigkePMsg==} - clawdbot@2026.1.24-3: - resolution: {integrity: sha512-zt9BzhWXduq8ZZR4rfzQDurQWAgmijTTyPZCQGrn5ew6wCEwhxxEr2/NHG7IlCwcfRsKymsY4se9KMhoNz0JtQ==} - engines: {node: '>=22.12.0'} - hasBin: true - cli-cursor@5.0.0: resolution: {integrity: sha512-aCj4O5wKyszjMmDT4tZj93kxyydN/K5zPWSCe6/0AV/AA1pqe5ZBIw0a2ZfPQV7lL5/yb5HsUreJ6UFAF1tEQw==} engines: {node: '>=18'} @@ -5193,6 +5281,7 @@ packages: tar@7.5.4: resolution: {integrity: sha512-AN04xbWGrSTDmVwlI4/GTlIIwMFk/XEv7uL8aa57zuvRy6s4hdBed+lVq2fAZ89XDa7Us3ANXcE3Tvqvja1kTA==} engines: {node: '>=18'} + deprecated: Old versions of tar are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. 
Support for old versions may be purchased (at exhorbitant rates) by contacting i@izs.me thenify-all@1.6.0: resolution: {integrity: sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==} @@ -7160,6 +7249,13 @@ snapshots: '@napi-rs/canvas-win32-x64-msvc': 0.1.88 optional: true + '@napi-rs/wasm-runtime@0.2.12': + dependencies: + '@emnapi/core': 1.8.1 + '@emnapi/runtime': 1.8.1 + '@tybys/wasm-util': 0.10.1 + optional: true + '@napi-rs/wasm-runtime@1.1.1': dependencies: '@emnapi/core': 1.8.1 @@ -7222,6 +7318,67 @@ snapshots: '@node-llama-cpp/win-x64@3.15.0': optional: true + '@node-rs/jieba-android-arm-eabi@1.10.4': + optional: true + + '@node-rs/jieba-android-arm64@1.10.4': + optional: true + + '@node-rs/jieba-darwin-arm64@1.10.4': + optional: true + + '@node-rs/jieba-darwin-x64@1.10.4': + optional: true + + '@node-rs/jieba-freebsd-x64@1.10.4': + optional: true + + '@node-rs/jieba-linux-arm-gnueabihf@1.10.4': + optional: true + + '@node-rs/jieba-linux-arm64-gnu@1.10.4': + optional: true + + '@node-rs/jieba-linux-arm64-musl@1.10.4': + optional: true + + '@node-rs/jieba-linux-x64-gnu@1.10.4': + optional: true + + '@node-rs/jieba-linux-x64-musl@1.10.4': + optional: true + + '@node-rs/jieba-wasm32-wasi@1.10.4': + dependencies: + '@napi-rs/wasm-runtime': 0.2.12 + optional: true + + '@node-rs/jieba-win32-arm64-msvc@1.10.4': + optional: true + + '@node-rs/jieba-win32-ia32-msvc@1.10.4': + optional: true + + '@node-rs/jieba-win32-x64-msvc@1.10.4': + optional: true + + '@node-rs/jieba@1.10.4': + optionalDependencies: + '@node-rs/jieba-android-arm-eabi': 1.10.4 + '@node-rs/jieba-android-arm64': 1.10.4 + '@node-rs/jieba-darwin-arm64': 1.10.4 + '@node-rs/jieba-darwin-x64': 1.10.4 + '@node-rs/jieba-freebsd-x64': 1.10.4 + '@node-rs/jieba-linux-arm-gnueabihf': 1.10.4 + '@node-rs/jieba-linux-arm64-gnu': 1.10.4 + '@node-rs/jieba-linux-arm64-musl': 1.10.4 + '@node-rs/jieba-linux-x64-gnu': 1.10.4 + '@node-rs/jieba-linux-x64-musl': 1.10.4 + 
'@node-rs/jieba-wasm32-wasi': 1.10.4 + '@node-rs/jieba-win32-arm64-msvc': 1.10.4 + '@node-rs/jieba-win32-ia32-msvc': 1.10.4 + '@node-rs/jieba-win32-x64-msvc': 1.10.4 + '@nodelib/fs.scandir@2.1.5': dependencies: '@nodelib/fs.stat': 2.0.5 @@ -9098,84 +9255,6 @@ snapshots: dependencies: clsx: 2.1.1 - clawdbot@2026.1.24-3(@types/express@5.0.6)(audio-decode@2.2.3)(devtools-protocol@0.0.1561482)(typescript@5.9.3): - dependencies: - '@agentclientprotocol/sdk': 0.13.1(zod@4.3.6) - '@aws-sdk/client-bedrock': 3.975.0 - '@buape/carbon': 0.14.0(hono@4.11.4) - '@clack/prompts': 0.11.0 - '@grammyjs/runner': 2.0.3(grammy@1.39.3) - '@grammyjs/transformer-throttler': 1.2.1(grammy@1.39.3) - '@homebridge/ciao': 1.3.4 - '@line/bot-sdk': 10.6.0 - '@lydell/node-pty': 1.2.0-beta.3 - '@mariozechner/pi-agent-core': 0.49.3(ws@8.19.0)(zod@4.3.6) - '@mariozechner/pi-ai': 0.49.3(ws@8.19.0)(zod@4.3.6) - '@mariozechner/pi-coding-agent': 0.49.3(ws@8.19.0)(zod@4.3.6) - '@mariozechner/pi-tui': 0.49.3 - '@mozilla/readability': 0.6.0 - '@sinclair/typebox': 0.34.47 - '@slack/bolt': 4.6.0(@types/express@5.0.6) - '@slack/web-api': 7.13.0 - '@whiskeysockets/baileys': 7.0.0-rc.9(audio-decode@2.2.3)(sharp@0.34.5) - ajv: 8.17.1 - body-parser: 2.2.2 - chalk: 5.6.2 - chokidar: 5.0.0 - chromium-bidi: 13.0.1(devtools-protocol@0.0.1561482) - cli-highlight: 2.1.11 - commander: 14.0.2 - croner: 9.1.0 - detect-libc: 2.1.2 - discord-api-types: 0.38.37 - dotenv: 17.2.3 - express: 5.2.1 - file-type: 21.3.0 - grammy: 1.39.3 - hono: 4.11.4 - jiti: 2.6.1 - json5: 2.2.3 - jszip: 3.10.1 - linkedom: 0.18.12 - long: 5.3.2 - markdown-it: 14.1.0 - node-edge-tts: 1.2.9 - osc-progress: 0.3.0 - pdfjs-dist: 5.4.530 - playwright-core: 1.58.0 - proper-lockfile: 4.1.2 - qrcode-terminal: 0.12.0 - sharp: 0.34.5 - sqlite-vec: 0.1.7-alpha.2 - tar: 7.5.4 - tslog: 4.10.2 - undici: 7.19.0 - ws: 8.19.0 - yaml: 2.8.2 - zod: 4.3.6 - optionalDependencies: - '@napi-rs/canvas': 0.1.88 - node-llama-cpp: 3.15.0(typescript@5.9.3) - 
transitivePeerDependencies: - - '@discordjs/opus' - - '@modelcontextprotocol/sdk' - - '@types/express' - - audio-decode - - aws-crt - - bufferutil - - canvas - - debug - - devtools-protocol - - encoding - - ffmpeg-static - - jimp - - link-preview-js - - node-opus - - opusscript - - supports-color - - typescript - - utf-8-validate - cli-cursor@5.0.0: dependencies: restore-cursor: 5.1.0 diff --git a/src/memory/hybrid.test.ts b/src/memory/hybrid.test.ts index 294dc9950..1d4a99274 100644 --- a/src/memory/hybrid.test.ts +++ b/src/memory/hybrid.test.ts @@ -9,6 +9,33 @@ describe("memory hybrid helpers", () => { expect(buildFtsQuery(" ")).toBeNull(); }); + it("buildFtsQuery handles Chinese text with tokenization", () => { + // Pure-Chinese input: segmentation should yield multiple tokens + const result = buildFtsQuery("你好世界"); + // The segmented result must not be null + expect(result).not.toBeNull(); + // When a query was produced it should join the tokens with AND + if (result) { + expect(result).toContain("AND"); + } + }); + + it("buildFtsQuery handles mixed Chinese and English text", () => { + const mixedResult = buildFtsQuery("你好 hello world 世界"); + expect(mixedResult).not.toBeNull(); + expect(mixedResult).toContain("AND"); + expect(mixedResult).toContain('"hello"'); + expect(mixedResult).toContain('"world"'); + }); + + it("buildFtsQuery deduplicates tokens", () => { + const result = buildFtsQuery("hello hello world"); + expect(result).not.toBeNull(); + // "hello" should only appear once + const helloCount = (result!.match(/"hello"/g) || []).length; + expect(helloCount).toBe(1); + }); + it("bm25RankToScore is monotonic and clamped", () => { expect(bm25RankToScore(0)).toBeCloseTo(1); expect(bm25RankToScore(1)).toBeCloseTo(0.5); diff --git a/src/memory/hybrid.ts b/src/memory/hybrid.ts index 753748bf9..b0be6d242 100644 --- a/src/memory/hybrid.ts +++ b/src/memory/hybrid.ts @@ -1,3 +1,5 @@ +import { hasChinese, tokenizeMixed } from "./tokenizer-zh.js"; + export type HybridSource = string; export type HybridVectorResult = { @@ -21,13 +23,30 @@ export type HybridKeywordResult = { 
}; export function buildFtsQuery(raw: string): string | null { - const tokens = - raw - .match(/[A-Za-z0-9_]+/g) - ?.map((t) => t.trim()) - .filter(Boolean) ?? []; + if (!raw || typeof raw !== "string") return null; + const trimmed = raw.trim(); + if (!trimmed) return null; + + let tokens: string[]; + + if (hasChinese(trimmed)) { + // Text containing Chinese is segmented with tokenizeMixed + tokens = tokenizeMixed(trimmed); + } else { + // English/numeric text keeps the original regex extraction + tokens = + trimmed + .match(/[A-Za-z0-9_]+/g) + ?.map((t) => t.trim()) + .filter(Boolean) ?? []; + } + if (tokens.length === 0) return null; - const quoted = tokens.map((t) => `"${t.replaceAll('"', "")}"`); + + // Deduplicate while preserving first-occurrence order + const uniqueTokens = [...new Set(tokens)]; + + const quoted = uniqueTokens.map((t) => `"${t.replaceAll('"', "")}"`); return quoted.join(" AND "); } diff --git a/src/memory/internal.ts b/src/memory/internal.ts index b2ab8c0a4..9a62ec713 100644 --- a/src/memory/internal.ts +++ b/src/memory/internal.ts @@ -3,6 +3,8 @@ import fsSync from "node:fs"; import fs from "node:fs/promises"; import path from "node:path"; +import { tokenizeMixed, hasChinese } from "./tokenizer-zh.js"; + export type MemoryFileEntry = { path: string; absPath: string; @@ -239,3 +241,27 @@ export function cosineSimilarity(a: number[], b: number[]): number { if (normA === 0 || normB === 0) return 0; return dot / (Math.sqrt(normA) * Math.sqrt(normB)); } + +/** + * Preprocess text for FTS indexing. + * Text containing Chinese is segmented with tokenizeMixed and re-joined with spaces; text without Chinese is returned unchanged; empty or non-string input yields "". NOTE(review): tokenizeMixed drops punctuation-only tokens and deduplicates, so repeated terms are stored once; confirm this is acceptable for ranking and for any caller that reads the stored text back. + */ +export function preprocessTextForFts(text: string): string { + if (!text || typeof text !== "string") { + return ""; + } + + // No Chinese characters: return the text as-is + if (!hasChinese(text)) { + return text; + } + + // Segment the mixed text; fall back to the original text when nothing survives filtering + const tokens = tokenizeMixed(text); + if (tokens.length === 0) { + return text; + } + + // Return the tokens joined by single spaces + return tokens.join(" "); +} diff --git a/src/memory/manager.ts b/src/memory/manager.ts index a799a5e0f..a7633b65e 100644 --- a/src/memory/manager.ts +++ b/src/memory/manager.ts @@ -37,6 +37,7 @@ import { isMemoryPath, listMemoryFiles, 
normalizeExtraMemoryPaths, + preprocessTextForFts, type MemoryChunk, type MemoryFileEntry, parseEmbedding, @@ -2202,13 +2203,15 @@ export class MemoryIndexManager { .run(id, vectorToBlob(embedding)); } if (this.fts.enabled && this.fts.available) { + // Preprocess the chunk text (segmenting Chinese) before writing it to the FTS table + const ftsText = preprocessTextForFts(chunk.text); this.db .prepare( `INSERT INTO ${FTS_TABLE} (text, id, path, source, model, start_line, end_line)\n` + ` VALUES (?, ?, ?, ?, ?, ?, ?)`, ) .run( - chunk.text, + ftsText, id, entry.path, options.source, diff --git a/src/memory/tokenizer-zh.test.ts b/src/memory/tokenizer-zh.test.ts new file mode 100644 index 000000000..c66c7f999 --- /dev/null +++ b/src/memory/tokenizer-zh.test.ts @@ -0,0 +1,67 @@ +import { describe, it, expect } from "vitest"; +import { hasChinese, tokenizeMixed } from "./tokenizer-zh.js"; + +describe("Chinese tokenizer", () => { + describe("hasChinese", () => { + it("detects Chinese characters", () => { + expect(hasChinese("你好")).toBe(true); + expect(hasChinese("世界")).toBe(true); + expect(hasChinese("你好世界")).toBe(true); + expect(hasChinese("中文测试")).toBe(true); + }); + + it("returns false for pure English text", () => { + expect(hasChinese("hello world")).toBe(false); + expect(hasChinese("test")).toBe(false); + expect(hasChinese("")).toBe(false); + }); + + it("returns true for mixed Chinese and English", () => { + expect(hasChinese("你好 hello")).toBe(true); + expect(hasChinese("hello 世界")).toBe(true); + }); + + it("handles empty and invalid input", () => { + expect(hasChinese("")).toBe(false); + expect(hasChinese(null as unknown as string)).toBe(false); + expect(hasChinese(undefined as unknown as string)).toBe(false); + }); + }); + + describe("tokenizeMixed", () => { + it("tokenizes pure Chinese text", () => { + const result = tokenizeMixed("你好世界"); + expect(result.length).toBeGreaterThan(1); + expect(result).toContain("你好"); + expect(result).toContain("世界"); + }); + + it("tokenizes Chinese sentence", () => { + const result = 
tokenizeMixed("今天天气很好"); + expect(result.length).toBeGreaterThan(0); + // Expect "今天" and "天气" to be split out; the assertion below only checks that some token contains "今天" or "天" + expect(result.some((t) => t.includes("今天") || t.includes("天"))).toBe(true); + }); + + it("extracts English tokens from pure English text", () => { + const result = tokenizeMixed("hello world test"); + expect(result).toContain("hello"); + expect(result).toContain("world"); + expect(result).toContain("test"); + }); + + it("tokenizes mixed Chinese and English text", () => { + const result = tokenizeMixed("你好 hello world 世界"); + expect(result.length).toBeGreaterThan(2); + expect(result).toContain("你好"); + expect(result).toContain("hello"); + expect(result).toContain("world"); + expect(result).toContain("世界"); + }); + + it("handles empty input", () => { + expect(tokenizeMixed("")).toEqual([]); + expect(tokenizeMixed(null as unknown as string)).toEqual([]); + }); + }); +}); diff --git a/src/memory/tokenizer-zh.ts b/src/memory/tokenizer-zh.ts new file mode 100644 index 000000000..b1c25bc3e --- /dev/null +++ b/src/memory/tokenizer-zh.ts @@ -0,0 +1,61 @@ +import { cut, load } from "@node-rs/jieba"; + +// Load the jieba dictionary once, as a side effect of importing this module +load(); + +// Punctuation-only pattern: ASCII punctuation and space plus CJK/fullwidth punctuation ranges, no Chinese ideographs. NOTE(review): \uff00-\uff60 also spans fullwidth digits and Latin letters (U+FF10..U+FF5A), so tokens made only of those are filtered as punctuation; confirm intended +const punctuationPattern = + /^[\u0020-\u002f\u003a-\u0040\u005b-\u0060\u007b-\u007e\u3000-\u303f\ufe10-\ufe1f\ufe30-\ufe44\ufe50-\ufe6b\uff00-\uff60\uffe0-\uffe6]+$/; + +/** + * Returns true when the text contains at least one character in U+4E00..U+9FA5; returns false for empty or non-string input. + */ +export function hasChinese(text: string): boolean { + if (!text || typeof text !== "string") return false; + // Test against the CJK ideograph range. NOTE(review): the range ends at U+9FA5 and leaves out U+9FA6..U+9FFF and Extension A (U+3400..U+4DBF); confirm whether rarer characters should count + return /[\u4e00-\u9fa5]/.test(text); +} + +/** + * Tokenize text that may mix Chinese with English/digits. + * Text containing Chinese is segmented by jieba, then trimmed, stripped of punctuation-only tokens and deduplicated; text without Chinese yields its [A-Za-z0-9_]+ runs. Returns [] for empty or non-string input. + */ +export function tokenizeMixed(text: string): string[] { + if (!text || typeof text !== "string") return []; + + // No Chinese: extract only runs of ASCII letters, digits and underscore + if (!hasChinese(text)) { + return ( + text + .match(/[A-Za-z0-9_]+/g) + ?.map((t) => t.trim()) + .filter(Boolean) ?? 
[] + ); + } + + // Segment the whole text with jieba; hmm=false is intended by the author to keep digit/English runs from being split (not verified here) + const tokens = cut(text, false); + + // jieba output may include punctuation; keep only meaningful tokens + const result: string[] = []; + for (const token of tokens) { + // Skip empty strings + if (!token || token.length === 0) continue; + + // Strip leading/trailing whitespace from the token + const trimmed = token.trim(); + + // Drop tokens that were whitespace only + if (trimmed.length === 0) continue; + + // Skip punctuation-only tokens (Chinese characters never match this pattern) + if (punctuationPattern.test(trimmed)) continue; + + result.push(trimmed); + } + + // Deduplicate, keeping first-occurrence order. NOTE(review): preprocessTextForFts indexes this output, so repeated terms reach the FTS table only once; confirm that is acceptable for ranking + const uniqueTokens = [...new Set(result)]; + + return uniqueTokens; +}