From b6efe41b8640b789dae0d3b84e27afbc0e8b9424 Mon Sep 17 00:00:00 2001 From: suyao Date: Wed, 3 Dec 2025 14:13:07 +0800 Subject: [PATCH] feat: add transliteration support for internationalization in tool names --- package.json | 1 + src/main/utils/__tests__/mcp.test.ts | 102 +++++++++++++++++++-------- src/main/utils/mcp.ts | 62 +++++++++++++--- yarn.lock | 13 ++++ 4 files changed, 138 insertions(+), 40 deletions(-) diff --git a/package.json b/package.json index 3f95aee6d5..74c2ccaea1 100644 --- a/package.json +++ b/package.json @@ -363,6 +363,7 @@ "tar": "^7.4.3", "tiny-pinyin": "^1.3.2", "tokenx": "^1.1.0", + "transliteration": "^2.3.5", "tsx": "^4.20.3", "turndown-plugin-gfm": "^1.0.2", "tw-animate-css": "^1.3.8", diff --git a/src/main/utils/__tests__/mcp.test.ts b/src/main/utils/__tests__/mcp.test.ts index 097dc8ba7c..dee5ab2a87 100644 --- a/src/main/utils/__tests__/mcp.test.ts +++ b/src/main/utils/__tests__/mcp.test.ts @@ -61,12 +61,13 @@ describe('buildFunctionCallToolName', () => { it('should replace invalid characters with underscores', () => { const result = buildFunctionCallToolName('test@server', 'tool#name') expect(result).not.toMatch(/[@#]/) - expect(result).toMatch(/^[\p{L}\p{N}_-]+$/u) + // Should only contain ASCII alphanumeric, underscore, dash, dot, colon + expect(result).toMatch(/^[a-zA-Z0-9_.\-:]+$/) }) it('should ensure name starts with a letter or underscore', () => { const result = buildFunctionCallToolName('123server', '456tool') - expect(result).toMatch(/^[\p{L}_]/u) + expect(result).toMatch(/^[a-zA-Z_]/) }) it('should handle consecutive underscores/dashes', () => { @@ -130,7 +131,7 @@ describe('buildFunctionCallToolName', () => { // Should still produce a valid unique suffix via fallback hash expect(result).toBeTruthy() expect(result.length).toBeLessThanOrEqual(63) - expect(result).toMatch(/^[\p{L}_][\p{L}\p{N}_-]*$/u) + expect(result).toMatch(/^[a-zA-Z_][a-zA-Z0-9_.\-:]*$/) // Should have a suffix (underscore followed by something) 
expect(result).toMatch(/_[a-z0-9]+$/) }) @@ -177,9 +178,9 @@ describe('buildFunctionCallToolName', () => { // Should be different expect(tool1).not.toBe(tool2) - // Both should be valid identifiers - expect(tool1).toMatch(/^[\p{L}_][\p{L}\p{N}_-]*$/u) - expect(tool2).toMatch(/^[\p{L}_][\p{L}\p{N}_-]*$/u) + // Both should be valid AI model tool names (ASCII only) + expect(tool1).toMatch(/^[a-zA-Z_][a-zA-Z0-9_.\-:]*$/) + expect(tool2).toMatch(/^[a-zA-Z_][a-zA-Z0-9_.\-:]*$/) // Both should be <= 63 chars expect(tool1.length).toBeLessThanOrEqual(63) @@ -194,12 +195,14 @@ describe('buildFunctionCallToolName', () => { }) }) - describe('internationalization support', () => { - it('should preserve Chinese characters in tool names', () => { + describe('internationalization support (CJK to ASCII transliteration)', () => { + it('should convert Chinese characters to pinyin', () => { const result = buildFunctionCallToolName('ocr', '行驶证OCR_轻盈版') - expect(result).toContain('行驶证') - expect(result).toContain('OCR') - expect(result).toContain('轻盈版') + // Chinese characters should be transliterated to pinyin + expect(result).not.toMatch(/[\u4e00-\u9fff]/) // No Chinese characters + expect(result).toContain('ocr') // OCR is lowercased + // Should only contain ASCII characters (lowercase) + expect(result).toMatch(/^[a-z_][a-z0-9_.\-:]*$/) }) it('should distinguish between different Chinese OCR tools', () => { @@ -210,47 +213,84 @@ describe('buildFunctionCallToolName', () => { buildFunctionCallToolName('ocr', '身份证OCR') ] - // All tools should be unique + // All tools should be unique (pinyin transliterations are different) const uniqueTools = new Set(tools) expect(uniqueTools.size).toBe(4) - // Verify each tool contains its distinctive Chinese characters - expect(tools[0]).toContain('行驶证') - expect(tools[1]).toContain('营业执照') - expect(tools[2]).toContain('车牌') - expect(tools[3]).toContain('身份证') + // All should be ASCII-only valid tool names + tools.forEach((tool) => { + 
expect(tool).toMatch(/^[a-zA-Z_][a-zA-Z0-9_.\-:]*$/) + expect(tool).not.toMatch(/[\u4e00-\u9fff]/) // No Chinese characters + }) + + // Verify they contain transliterated pinyin (with underscores between characters) + // 行驶证 = xing_shi_zheng, 营业执照 = ying_ye_zhi_zhao, 车牌 = che_pai, 身份证 = shen_fen_zheng + expect(tools[0]).toContain('xing_shi_zheng') + expect(tools[1]).toContain('ying_ye_zhi_zhao') + expect(tools[2]).toContain('che_pai') + expect(tools[3]).toContain('shen_fen_zheng') }) - it('should handle Japanese characters', () => { + it('should handle Japanese characters with base36 encoding', () => { const result = buildFunctionCallToolName('server', 'ユーザー検索') - expect(result).toContain('ユーザー検索') - expect(result).toMatch(/^[\p{L}_][\p{L}\p{N}_-]*$/u) + // Should be ASCII-only + expect(result).toMatch(/^[a-zA-Z_][a-zA-Z0-9_.\-:]*$/) + // Should not contain original Japanese characters + expect(result).not.toMatch(/[\u3040-\u309f\u30a0-\u30ff]/) }) - it('should handle Korean characters', () => { + it('should handle Korean characters with base36 encoding', () => { const result = buildFunctionCallToolName('server', '사용자검색') - expect(result).toContain('사용자검색') - expect(result).toMatch(/^[\p{L}_][\p{L}\p{N}_-]*$/u) + // Should be ASCII-only + expect(result).toMatch(/^[a-zA-Z_][a-zA-Z0-9_.\-:]*$/) + // Should not contain original Korean characters + expect(result).not.toMatch(/[\uac00-\ud7af]/) }) it('should handle mixed language tool names', () => { const result = buildFunctionCallToolName('api', 'search用户by名称') + // ASCII parts should be preserved (lowercased) expect(result).toContain('search') - expect(result).toContain('用户') expect(result).toContain('by') - expect(result).toContain('名称') - expect(result).toMatch(/^[\p{L}_][\p{L}\p{N}_-]*$/u) + // Chinese parts should be transliterated (用户 = yong_hu, 名称 = ming_cheng) + expect(result).toContain('yong_hu') + expect(result).toContain('ming_cheng') + // Final result should be ASCII-only (lowercase) + 
expect(result).toMatch(/^[a-z_][a-z0-9_.\-:]*$/) }) - it('should replace only control characters and special symbols, not Unicode letters', () => { + it('should transliterate Chinese and replace special symbols', () => { const result = buildFunctionCallToolName('test', '文件@上传#工具') // @ and # should be replaced with underscores expect(result).not.toContain('@') expect(result).not.toContain('#') - // Chinese characters should be preserved - expect(result).toContain('文件') - expect(result).toContain('上传') - expect(result).toContain('工具') + // Chinese characters should be transliterated + // 文件 = wen_jian, 上传 = shang_chuan, 工具 = gong_ju + expect(result).toContain('wen_jian') + expect(result).toContain('shang_chuan') + expect(result).toContain('gong_ju') + // Should be ASCII-only (lowercase) + expect(result).toMatch(/^[a-z_][a-z0-9_.\-:]*$/) + }) + + it('should produce AI model compatible tool names', () => { + const testCases = [ + '行驶证OCR', + '营业执照识别', + 'get用户info', + '文件@处理', + '数据分析_v2' + ] + + testCases.forEach((testCase) => { + const result = buildFunctionCallToolName('server', testCase) + // Must start with letter or underscore + expect(result).toMatch(/^[a-zA-Z_]/) + // Must only contain a-z, A-Z, 0-9, _, -, ., : + expect(result).toMatch(/^[a-zA-Z0-9_.\-:]+$/) + // Must be <= 64 characters + expect(result.length).toBeLessThanOrEqual(64) + }) }) }) }) diff --git a/src/main/utils/mcp.ts b/src/main/utils/mcp.ts index 31d0fe9f52..0a144f49f6 100644 --- a/src/main/utils/mcp.ts +++ b/src/main/utils/mcp.ts @@ -1,6 +1,51 @@ +import { transliterate } from 'transliteration' + +/** + * Transliterate non-ASCII characters to ASCII equivalents + * - Chinese → Pinyin, one underscore-separated syllable per character (e.g., 行驶证 → xing_shi_zheng) + * - Japanese → ASCII Romaji (no macrons; the output never contains characters such as ū or ā) + * - Korean → Romanization (e.g., 사용자 → sayongja) + * - Other special characters → underscores + */ +/** + * Transliterates non-ASCII text (including CJK characters) to ASCII-compatible format. 
+ * + * Converts input text to a lowercase ASCII representation, replacing whitespace runs with a single underscore. + * Characters that cannot be transliterated, and any character outside the allowed set, are replaced with underscores (nothing is removed). + * + * @param text - The input string to transliterate, may contain Unicode characters including CJK + * @returns A lowercase ASCII string with whitespace converted to underscores and every other disallowed character replaced by an underscore, + * preserving only alphanumeric characters, underscores, dots, hyphens, and colons + * + * @example + * ```typescript + * transliterateToAscii("Hello World") // returns "hello_world" + * transliterateToAscii("你好世界") // returns transliterated version with underscores + * transliterateToAscii("Café-123") // returns "cafe-123" (hyphens are kept here; the caller maps them to underscores) + * ``` + */ +function transliterateToAscii(text: string): string { + // Use transliteration library which supports CJK (Chinese, Japanese, Korean) + const result = transliterate(text, { + // Unknown/special characters become underscores + unknown: '_', + ignore: [] + }) + + // Convert to lowercase, replace whitespace runs with underscores, and replace any other disallowed char with an underscore + return result + .toLowerCase() + .replace(/\s+/g, '_') + .replace(/[^a-z0-9_.\-:]/g, '_') +} + export function buildFunctionCallToolName(serverName: string, toolName: string, serverId?: string) { - const sanitizedServer = serverName.trim().replace(/-/g, '_') - const sanitizedTool = toolName.trim().replace(/-/g, '_') + // First, transliterate non-ASCII characters to ASCII + const transliteratedServer = transliterateToAscii(serverName.trim()) + const transliteratedTool = transliterateToAscii(toolName.trim()) + + const sanitizedServer = transliteratedServer.replace(/-/g, '_') + const sanitizedTool = transliteratedTool.replace(/-/g, '_') // Calculate suffix first to reserve space for it // Suffix format: "_" + 6 alphanumeric chars = 7 chars total @@ -26,14 +71,13 @@ export function buildFunctionCallToolName(serverName: string, toolName: string, name = `${sanitizedServer.slice(0, 7) || ''}-${sanitizedTool || ''}` } 
- // Replace invalid characters with underscores or dashes - // Keep Unicode letters (\p{L}), Unicode numbers (\p{N}), underscores and dashes - // This supports international characters (Chinese, Japanese, Korean, etc.) - name = name.replace(/[^\p{L}\p{N}_-]/gu, '_') + // Replace invalid characters with underscores + // Keep only a-z, A-Z, 0-9, underscores, dashes, dots, colons (AI model compatible) + name = name.replace(/[^a-zA-Z0-9_.\-:]/g, '_') - // Ensure name starts with a letter or underscore (supports Unicode letters) - if (!/^[\p{L}_]/u.test(name)) { - name = `tool-${name}` + // Ensure name starts with a letter or underscore (AI model requirement) + if (!/^[a-zA-Z_]/.test(name)) { + name = `tool_${name}` } // Remove consecutive underscores/dashes (optional improvement) diff --git a/yarn.lock b/yarn.lock index 22b6c581db..da56898bae 100644 --- a/yarn.lock +++ b/yarn.lock @@ -10285,6 +10285,7 @@ __metadata: tesseract.js: "patch:tesseract.js@npm%3A6.0.1#~/.yarn/patches/tesseract.js-npm-6.0.1-2562a7e46d.patch" tiny-pinyin: "npm:^1.3.2" tokenx: "npm:^1.1.0" + transliteration: "npm:^2.3.5" tsx: "npm:^4.20.3" turndown: "npm:7.2.0" turndown-plugin-gfm: "npm:^1.0.2" @@ -24607,6 +24608,18 @@ __metadata: languageName: node linkType: hard +"transliteration@npm:^2.3.5": + version: 2.3.5 + resolution: "transliteration@npm:2.3.5" + dependencies: + yargs: "npm:^17.5.1" + bin: + slugify: dist/bin/slugify + transliterate: dist/bin/transliterate + checksum: 10c0/68397225c2ca59b8e33206c65f905724e86b64460cbf90576d352dc2366e763ded97e2c7b8b1f140fb36a565d61a97c51080df9fa638e6b1769f6cb24f383756 + languageName: node + linkType: hard + "tree-kill@npm:1.2.2, tree-kill@npm:^1.2.2": version: 1.2.2 resolution: "tree-kill@npm:1.2.2"