diff --git a/package.json b/package.json index a70663ffc..fcf0e531b 100644 --- a/package.json +++ b/package.json @@ -364,6 +364,7 @@ "tar": "^7.4.3", "tiny-pinyin": "^1.3.2", "tokenx": "^1.1.0", + "transliteration": "^2.3.5", "tsx": "^4.20.3", "turndown-plugin-gfm": "^1.0.2", "tw-animate-css": "^1.3.8", diff --git a/src/main/utils/__tests__/mcp.test.ts b/src/main/utils/__tests__/mcp.test.ts index b1a35f925..a5c6400f7 100644 --- a/src/main/utils/__tests__/mcp.test.ts +++ b/src/main/utils/__tests__/mcp.test.ts @@ -61,12 +61,13 @@ describe('buildFunctionCallToolName', () => { it('should replace invalid characters with underscores', () => { const result = buildFunctionCallToolName('test@server', 'tool#name') expect(result).not.toMatch(/[@#]/) - expect(result).toMatch(/^[a-zA-Z0-9_-]+$/) + // Should only contain ASCII alphanumeric, underscore, dash, dot, colon + expect(result).toMatch(/^[a-zA-Z0-9_.\-:]+$/) }) - it('should ensure name starts with a letter', () => { + it('should ensure name starts with a letter or underscore', () => { const result = buildFunctionCallToolName('123server', '456tool') - expect(result).toMatch(/^[a-zA-Z]/) + expect(result).toMatch(/^[a-zA-Z_]/) }) it('should handle consecutive underscores/dashes', () => { @@ -130,7 +131,7 @@ describe('buildFunctionCallToolName', () => { // Should still produce a valid unique suffix via fallback hash expect(result).toBeTruthy() expect(result.length).toBeLessThanOrEqual(63) - expect(result).toMatch(/^[a-zA-Z][a-zA-Z0-9_-]*$/) + expect(result).toMatch(/^[a-zA-Z_][a-zA-Z0-9_.\-:]*$/) // Should have a suffix (underscore followed by something) expect(result).toMatch(/_[a-z0-9]+$/) }) @@ -177,9 +178,9 @@ describe('buildFunctionCallToolName', () => { // Should be different expect(tool1).not.toBe(tool2) - // Both should be valid identifiers - expect(tool1).toMatch(/^[a-zA-Z][a-zA-Z0-9_-]*$/) - expect(tool2).toMatch(/^[a-zA-Z][a-zA-Z0-9_-]*$/) + // Both should be valid AI model tool names (ASCII only) + 
expect(tool1).toMatch(/^[a-zA-Z_][a-zA-Z0-9_.\-:]*$/) + expect(tool2).toMatch(/^[a-zA-Z_][a-zA-Z0-9_.\-:]*$/) // Both should be <= 63 chars expect(tool1.length).toBeLessThanOrEqual(63) @@ -193,4 +194,97 @@ describe('buildFunctionCallToolName', () => { expect(result.split('github').length - 1).toBeLessThanOrEqual(2) }) }) + + describe('internationalization support (CJK to ASCII transliteration)', () => { + it('should convert Chinese characters to pinyin', () => { + const result = buildFunctionCallToolName('ocr', '行驶证OCR_轻盈版') + // Chinese characters should be transliterated to pinyin + expect(result).not.toMatch(/[\u4e00-\u9fff]/) // No Chinese characters + expect(result).toContain('ocr') // OCR is lowercased + // Should only contain ASCII characters (lowercase) + expect(result).toMatch(/^[a-z_][a-z0-9_-]*$/) + }) + + it('should distinguish between different Chinese OCR tools', () => { + const tools = [ + buildFunctionCallToolName('ocr', '行驶证OCR_轻盈版'), + buildFunctionCallToolName('ocr', '营业执照OCR_轻盈版'), + buildFunctionCallToolName('ocr', '车牌OCR_轻盈版'), + buildFunctionCallToolName('ocr', '身份证OCR') + ] + + // All tools should be unique (pinyin transliterations are different) + const uniqueTools = new Set(tools) + expect(uniqueTools.size).toBe(4) + + // All should be ASCII-only valid tool names + tools.forEach((tool) => { + expect(tool).toMatch(/^[a-z_][a-z0-9_-]*$/) + expect(tool).not.toMatch(/[\u4e00-\u9fff]/) // No Chinese characters + }) + + // Verify they contain transliterated pinyin (with underscores between characters) + // 行驶证 = xing_shi_zheng, 营业执照 = ying_ye_zhi_zhao, 车牌 = che_pai, 身份证 = shen_fen_zheng + expect(tools[0]).toContain('xing_shi_zheng') + expect(tools[1]).toContain('ying_ye_zhi_zhao') + expect(tools[2]).toContain('che_pai') + expect(tools[3]).toContain('shen_fen_zheng') + }) + + it('should handle Japanese characters with Romaji transliteration', () => { + const result = buildFunctionCallToolName('server', 'ユーザー検索') + // Should be ASCII-only 
(Japanese characters are transliterated to Romaji) + expect(result).toMatch(/^[a-z_][a-z0-9_-]*$/) + // Should not contain original Japanese characters + expect(result).not.toMatch(/[\u3040-\u309f\u30a0-\u30ff]/) + }) + + it('should handle Korean characters with romanization', () => { + const result = buildFunctionCallToolName('server', '사용자검색') + // Should be ASCII-only + expect(result).toMatch(/^[a-z_][a-z0-9_-]*$/) + // Should not contain original Korean characters + expect(result).not.toMatch(/[\uac00-\ud7af]/) + }) + + it('should handle mixed language tool names', () => { + const result = buildFunctionCallToolName('api', 'search用户by名称') + // ASCII parts should be preserved (lowercased) + expect(result).toContain('search') + expect(result).toContain('by') + // Chinese parts should be transliterated (用户 = yong_hu, 名称 = ming_cheng) + expect(result).toContain('yong_hu') + expect(result).toContain('ming_cheng') + // Final result should be ASCII-only (lowercase) + expect(result).toMatch(/^[a-z_][a-z0-9_-]*$/) + }) + + it('should transliterate Chinese and replace special symbols', () => { + const result = buildFunctionCallToolName('test', '文件@上传#工具') + // @ and # should be replaced with underscores + expect(result).not.toContain('@') + expect(result).not.toContain('#') + // Chinese characters should be transliterated + // 文件 = wen_jian, 上传 = shang_chuan, 工具 = gong_ju + expect(result).toContain('wen_jian') + expect(result).toContain('shang_chuan') + expect(result).toContain('gong_ju') + // Should be ASCII-only (lowercase) + expect(result).toMatch(/^[a-z_][a-z0-9_-]*$/) + }) + + it('should produce AI model compatible tool names', () => { + const testCases = ['行驶证OCR', '营业执照识别', 'get用户info', '文件@处理', '数据分析_v2'] + + testCases.forEach((testCase) => { + const result = buildFunctionCallToolName('server', testCase) + // Must start with letter or underscore + expect(result).toMatch(/^[a-z_]/) + // Must only contain a-z, 0-9, _, - + expect(result).toMatch(/^[a-z0-9_-]+$/) + // 
/**
 * Converts arbitrary text (including CJK) into a lowercase, ASCII-only fragment
 * that can be embedded in a tool name.
 *
 * Processing steps:
 * 1. `transliterate` is called with `unknown: '_'`, so characters it cannot map
 *    become underscores.
 * 2. The result is lowercased.
 * 3. Every run of whitespace is collapsed into a single underscore.
 * 4. Every remaining character outside `a-z`, `0-9`, `_` and `-` is replaced by an
 *    underscore. Characters are replaced, never dropped.
 *
 * If `transliterate` throws, steps 2-4 are applied to the original input instead,
 * so the fallback yields the same shape of output as the normal path (non-ASCII
 * characters simply become underscores).
 *
 * @param text - The string to convert; may contain any Unicode characters.
 * @returns A string made only of `a-z`, `0-9`, `_` and `-`, or the literal
 *          `'invalid_input'` when `text` is empty or not a string.
 */
function transliterateToAscii(text: string): string {
  // Empty strings are rejected together with non-strings: callers get a visible
  // placeholder rather than an empty name fragment.
  if (!text || typeof text !== 'string') {
    logger.warn('Invalid input to transliterateToAscii', { text })
    return 'invalid_input'
  }

  // Single normalization shared by the success and fallback paths so both produce
  // identically shaped output (previously the fallback skipped whitespace collapsing).
  const normalize = (value: string): string =>
    value
      .toLowerCase()
      .replace(/\s+/g, '_')
      .replace(/[^a-z0-9_-]/g, '_')

  try {
    const result = transliterate(text, {
      // Unknown/special characters become underscores
      unknown: '_',
      ignore: []
    })

    logger.debug('Transliteration successful', { input: text, output: result })

    return normalize(result)
  } catch (error) {
    logger.error('Transliteration failed, falling back to ASCII-only mode', { text, error })
    return normalize(text)
  }
}
(!/^[a-z_]/.test(name)) { + name = `tool_${name}` } // Remove consecutive underscores/dashes (optional improvement) diff --git a/yarn.lock b/yarn.lock index d9d5ec1d6..751600ab4 100644 --- a/yarn.lock +++ b/yarn.lock @@ -10336,6 +10336,7 @@ __metadata: tesseract.js: "patch:tesseract.js@npm%3A6.0.1#~/.yarn/patches/tesseract.js-npm-6.0.1-2562a7e46d.patch" tiny-pinyin: "npm:^1.3.2" tokenx: "npm:^1.1.0" + transliteration: "npm:^2.3.5" tsx: "npm:^4.20.3" turndown: "npm:7.2.0" turndown-plugin-gfm: "npm:^1.0.2" @@ -24737,6 +24738,18 @@ __metadata: languageName: node linkType: hard +"transliteration@npm:^2.3.5": + version: 2.3.5 + resolution: "transliteration@npm:2.3.5" + dependencies: + yargs: "npm:^17.5.1" + bin: + slugify: dist/bin/slugify + transliterate: dist/bin/transliterate + checksum: 10c0/68397225c2ca59b8e33206c65f905724e86b64460cbf90576d352dc2366e763ded97e2c7b8b1f140fb36a565d61a97c51080df9fa638e6b1769f6cb24f383756 + languageName: node + linkType: hard + "tree-kill@npm:1.2.2, tree-kill@npm:^1.2.2": version: 1.2.2 resolution: "tree-kill@npm:1.2.2"