From 83f36f5e77322ef34c55ae82f3624c3e0faf06d7 Mon Sep 17 00:00:00 2001 From: SuYao Date: Wed, 2 Jul 2025 03:03:03 +0800 Subject: [PATCH] refactor(WebSearchMiddleware, linkConverter): enhance link processing and buffering logic (#7724) - Updated WebSearchMiddleware to utilize the new smartLinkConverter structure, allowing for better handling of buffered content and fallback logic. - Introduced flushLinkConverterBuffer function to clear remaining buffered content at stream end. - Modified convertLinks and smartLinkConverter functions to return structured results indicating whether content was buffered. - Enhanced unit tests to cover new functionality and edge cases for link conversion and buffering behavior. --- .../middleware/core/WebSearchMiddleware.ts | 38 +++- .../src/utils/__tests__/linkConverter.test.ts | 184 +++++++++++++++++- src/renderer/src/utils/linkConverter.ts | 53 ++++- 3 files changed, 256 insertions(+), 19 deletions(-) diff --git a/src/renderer/src/aiCore/middleware/core/WebSearchMiddleware.ts b/src/renderer/src/aiCore/middleware/core/WebSearchMiddleware.ts index 97261e3d52..70915abffa 100644 --- a/src/renderer/src/aiCore/middleware/core/WebSearchMiddleware.ts +++ b/src/renderer/src/aiCore/middleware/core/WebSearchMiddleware.ts @@ -1,5 +1,5 @@ import { ChunkType } from '@renderer/types/chunk' -import { smartLinkConverter } from '@renderer/utils/linkConverter' +import { flushLinkConverterBuffer, smartLinkConverter } from '@renderer/utils/linkConverter' import { CompletionsParams, CompletionsResult, GenericChunk } from '../schemas' import { CompletionsContext, CompletionsMiddleware } from '../types' @@ -42,20 +42,46 @@ export const WebSearchMiddleware: CompletionsMiddleware = const providerType = model.provider || 'openai' // 使用当前可用的Web搜索结果进行链接转换 const text = chunk.text - const processedText = smartLinkConverter(text, providerType, isFirstChunk) + const result = smartLinkConverter(text, providerType, isFirstChunk) if (isFirstChunk) { isFirstChunk = false } - controller.enqueue({ - ...chunk, - text: processedText - }) + + // - 如果有内容被缓冲,说明convertLinks正在等待后续chunk,不使用原文本避免重复 + // - 如果没有内容被缓冲且结果为空,可能是其他处理问题,使用原文本作为安全回退 + let finalText: string + if (result.hasBufferedContent) { + // 有内容被缓冲,使用处理后的结果(可能为空,等待后续chunk) + finalText = result.text + } else { + // 没有内容被缓冲,可以安全使用回退逻辑 + finalText = result.text || text + } + + // 只有当finalText不为空时才发送chunk + if (finalText) { + controller.enqueue({ + ...chunk, + text: finalText + }) + } } else if (chunk.type === ChunkType.LLM_WEB_SEARCH_COMPLETE) { // 暂存Web搜索结果用于链接完善 ctx._internal.webSearchState!.results = chunk.llm_web_search // 将Web搜索完成事件继续传递下去 controller.enqueue(chunk) + } else if (chunk.type === ChunkType.LLM_RESPONSE_COMPLETE) { + // 流结束时,清空链接转换器的buffer并处理剩余内容 + const remainingText = flushLinkConverterBuffer() + if (remainingText) { + controller.enqueue({ + type: ChunkType.TEXT_DELTA, + text: remainingText + }) + } + // 继续传递LLM_RESPONSE_COMPLETE事件 + controller.enqueue(chunk) } else { controller.enqueue(chunk) } diff --git a/src/renderer/src/utils/__tests__/linkConverter.test.ts b/src/renderer/src/utils/__tests__/linkConverter.test.ts index eaecc3ca1f..eabca8e284 100644 --- a/src/renderer/src/utils/__tests__/linkConverter.test.ts +++ b/src/renderer/src/utils/__tests__/linkConverter.test.ts @@ -7,7 +7,8 @@ import { convertLinksToHunyuan, convertLinksToOpenRouter, convertLinksToZhipu, - extractUrlsFromMarkdown + extractUrlsFromMarkdown, + flushLinkConverterBuffer } from '../linkConverter' describe('linkConverter', () => { @@ -90,22 +91,197 @@ describe('linkConverter', () => { it('should convert links with domain-like text to numbered links', () => { const input = '查看这个网站 [example.com](https://example.com)' const result = convertLinks(input, true) - expect(result).toBe('查看这个网站 [1](https://example.com)') + expect(result.text).toBe('查看这个网站 [1](https://example.com)') + expect(result.hasBufferedContent).toBe(false) }) it('should handle parenthesized link format ([host](url))', () => { const input = '这里有链接 ([example.com](https://example.com))' const result = convertLinks(input, true) - expect(result).toBe('这里有链接 [1](https://example.com)') + expect(result.text).toBe('这里有链接 [1](https://example.com)') + expect(result.hasBufferedContent).toBe(false) }) it('should use the same counter for duplicate URLs', () => { const input = '第一个链接 [example.com](https://example.com) 和第二个相同链接 [subdomain.example.com](https://example.com)' const result = convertLinks(input, true) - expect(result).toBe( + expect(result.text).toBe( '第一个链接 [1](https://example.com) 和第二个相同链接 [1](https://example.com)' ) + expect(result.hasBufferedContent).toBe(false) + }) + + it('should not misinterpret code placeholders as incomplete links', () => { + const input = + 'The most common reason for a `404` error is that the repository specified in the `owner` and `repo`' + const result = convertLinks(input, true) + expect(result.text).toBe( + 'The most common reason for a `404` error is that the repository specified in the `owner` and `repo`' + ) + expect(result.hasBufferedContent).toBe(false) + }) + + it('should handle text with square brackets that are not links', () => { + const input = 'Use [owner] and [repo] placeholders in your configuration [file]' + const result = convertLinks(input, true) + expect(result.text).toBe('Use [owner] and [repo] placeholders in your configuration [file]') + expect(result.hasBufferedContent).toBe(false) + }) + + it('should handle markdown code blocks with square brackets', () => { + const input = 'In the code: `const config = { [key]: value }` you can see [brackets]' + const result = convertLinks(input, true) + expect(result.text).toBe('In the code: `const config = { [key]: value }` you can see [brackets]') + expect(result.hasBufferedContent).toBe(false) + }) + + it('should properly handle partial markdown link patterns', () => { + // 这种情况下,[text] 后面没有紧跟 (,所以不应该被当作潜在链接 + const input = 'Check the [documentation] for more details' + const result = convertLinks(input, true) + expect(result.text).toBe('Check the [documentation] for more details') + expect(result.hasBufferedContent).toBe(false) + }) + + it('should correctly identify and handle real incomplete links', () => { + // 第一个块包含真正的不完整链接模式 + const chunk1 = 'Visit [example.com](' + const result1 = convertLinks(chunk1, true) + expect(result1.text).toBe('Visit ') + expect(result1.hasBufferedContent).toBe(true) + + // 第二个块完成该链接 + const chunk2 = 'https://example.com) for more info' + const result2 = convertLinks(chunk2, false) + expect(result2.text).toBe('[1](https://example.com) for more info') + expect(result2.hasBufferedContent).toBe(false) + }) + + it('should handle mixed content with real links and placeholders', () => { + const input = 'Configure [owner] and [repo] in [GitHub](https://github.com) settings' + const result = convertLinks(input, true) + expect(result.text).toBe('Configure [owner] and [repo] in GitHub [1](https://github.com) settings') + expect(result.hasBufferedContent).toBe(false) + }) + + it('should handle empty text', () => { + const input = '' + const result = convertLinks(input, true) + expect(result.text).toBe('') + expect(result.hasBufferedContent).toBe(false) + }) + + it('should handle text with only square brackets', () => { + const input = '[][][]' + const result = convertLinks(input, true) + expect(result.text).toBe('[][][]') + expect(result.hasBufferedContent).toBe(false) + }) + + describe('streaming small chunks simulation', () => { + it('should handle non-link placeholders in small chunks without buffering', () => { + // 模拟用户遇到的问题:包含方括号占位符的文本被分成小chunks + const chunks = [ + 'The most common reason for a `404` error is that the repository specified in the `', + 'owner` and `', + 'repo` parameters are incorrect.' + ] + + let accumulatedText = '' + + // 第一个chunk + const result1 = convertLinks(chunks[0], true) + expect(result1.text).toBe(chunks[0]) // 应该立即返回,不缓冲 + expect(result1.hasBufferedContent).toBe(false) + accumulatedText += result1.text + + // 第二个chunk + const result2 = convertLinks(chunks[1], false) + expect(result2.text).toBe(chunks[1]) // 应该立即返回,不缓冲 + expect(result2.hasBufferedContent).toBe(false) + accumulatedText += result2.text + + // 第三个chunk + const result3 = convertLinks(chunks[2], false) + expect(result3.text).toBe(chunks[2]) // 应该立即返回,不缓冲 + expect(result3.hasBufferedContent).toBe(false) + accumulatedText += result3.text + + // 验证最终结果 + expect(accumulatedText).toBe(chunks.join('')) + expect(accumulatedText).toBe( + 'The most common reason for a `404` error is that the repository specified in the `owner` and `repo` parameters are incorrect.' + ) + }) + + it('should handle real links split across small chunks with proper buffering', () => { + // 模拟真实链接被分割成小chunks的情况 - 更现实的分割方式 + const chunks = [ + 'Please visit [example.com](', // 不完整链接 + 'https://example.com) for details' // 完成链接 + ] + + let accumulatedText = '' + + // 第一个chunk:包含不完整链接 [text]( + const result1 = convertLinks(chunks[0], true) + expect(result1.text).toBe('Please visit ') // 只返回安全部分 + expect(result1.hasBufferedContent).toBe(true) // [example.com]( 被缓冲 + accumulatedText += result1.text + + // 第二个chunk:完成链接 + const result2 = convertLinks(chunks[1], false) + expect(result2.text).toBe('[1](https://example.com) for details') // 完整链接 + 剩余文本 + expect(result2.hasBufferedContent).toBe(false) + accumulatedText += result2.text + + // 验证最终结果 + expect(accumulatedText).toBe('Please visit [1](https://example.com) for details') + }) + + it('should handle mixed content with placeholders and real links in small chunks', () => { + // 混合内容:既有占位符又有真实链接 - 更现实的分割方式 + const chunks = [ + 'Configure [owner] and [repo] in [GitHub](', // 占位符 + 不完整链接 + 'https://github.com) settings page.' // 完成链接 + ] + + let accumulatedText = '' + + // 第一个chunk:包含占位符和不完整链接 + const result1 = convertLinks(chunks[0], true) + expect(result1.text).toBe('Configure [owner] and [repo] in ') // 占位符保留,链接部分被缓冲 + expect(result1.hasBufferedContent).toBe(true) // [GitHub]( 被缓冲 + accumulatedText += result1.text + + // 第二个chunk:完成链接 + const result2 = convertLinks(chunks[1], false) + expect(result2.text).toBe('GitHub [1](https://github.com) settings page.') // 完整链接 + 剩余文本 + expect(result2.hasBufferedContent).toBe(false) + accumulatedText += result2.text + + // 验证最终结果 + expect(accumulatedText).toBe( + 'Configure [owner] and [repo] in GitHub [1](https://github.com) settings page.' + ) + expect(accumulatedText).toContain('[owner] and [repo]') // 占位符保持原样 + expect(accumulatedText).toContain('[1](https://github.com)') // 链接被转换 + }) + + it('should properly handle buffer flush at stream end', () => { + // 测试流结束时的buffer清理 + const incompleteChunk = 'Check the documentation at [GitHub](' + const result = convertLinks(incompleteChunk, true) + + // 应该有内容被缓冲 + expect(result.hasBufferedContent).toBe(true) + expect(result.text).toBe('Check the documentation at ') // 只返回安全部分 + + // 模拟流结束,强制清空buffer + const remainingText = flushLinkConverterBuffer() + expect(remainingText).toBe('[GitHub](') // buffer中的剩余内容 + }) }) }) diff --git a/src/renderer/src/utils/linkConverter.ts b/src/renderer/src/utils/linkConverter.ts index 238c88b10e..652c2f4283 100644 --- a/src/renderer/src/utils/linkConverter.ts +++ b/src/renderer/src/utils/linkConverter.ts @@ -126,9 +126,12 @@ export function convertLinksToHunyuan(text: string, webSearch: any[], resetCount * * @param {string} text The current chunk of text to process * @param {boolean} resetCounter Whether to reset the counter and buffer - * @returns {string} Processed text with complete links converted + * @returns {{text: string, hasBufferedContent: boolean}} Processed text and whether content was buffered */ -export function convertLinks(text: string, resetCounter: boolean = false): string { +export function convertLinks( + text: string, + resetCounter: boolean = false +): { text: string; hasBufferedContent: boolean } { if (resetCounter) { linkCounter = 1 buffer = '' @@ -158,12 +161,22 @@ export function convertLinks(text: string, resetCounter: boolean = false): strin } else if (buffer[i] === '[') { // Check if this could be the start of a regular link const substring = buffer.substring(i) - const match = /^\[([^\]]+)\]\(([^)]+)\)/.exec(substring) - if (!match) { + // 检查是否是真正的不完整链接:[text]( 但没有完整的 url) + const incompleteLink = /^\[([^\]]+)\]\s*\([^)]*$/.test(substring) + if (incompleteLink) { safePoint = i break } + + // 检查是否是完整的链接但需要验证 + const completeLink = /^\[([^\]]+)\]\(([^)]+)\)/.test(substring) + if (completeLink) { + // 如果是完整链接,继续处理,不设置safePoint + continue + } + + // 如果不是潜在的链接格式,继续检查 } } @@ -171,6 +184,9 @@ export function convertLinks(text: string, resetCounter: boolean = false): strin const safeBuffer = buffer.substring(0, safePoint) buffer = buffer.substring(safePoint) + // 检查是否有内容被保留在buffer中 + const hasBufferedContent = buffer.length > 0 + // Process the safe buffer to handle complete links let result = '' let position = 0 @@ -237,7 +253,10 @@ export function convertLinks(text: string, resetCounter: boolean = false): strin position++ } - return result + return { + text: result, + hasBufferedContent + } } /** @@ -439,13 +458,13 @@ export function extractWebSearchReferences(text: string): Array<{ * @param {any[]} webSearchResults Web搜索结果数组 * @param {string} providerType Provider类型 ('openai', 'zhipu', 'hunyuan', 'openrouter', etc.) * @param {boolean} resetCounter 是否重置计数器 - * @returns {string} 转换后的文本 + * @returns {{text: string, hasBufferedContent: boolean}} 转换后的文本和是否有内容被缓冲 */ export function smartLinkConverter( text: string, providerType: string = 'openai', resetCounter: boolean = false -): string { +): { text: string; hasBufferedContent: boolean } { // 检测文本中的引用模式 const references = extractWebSearchReferences(text) @@ -458,10 +477,26 @@ export function smartLinkConverter( const hasZhipuPattern = references.some((ref) => ref.placeholder.includes('ref_')) if (hasZhipuPattern) { - return convertLinksToZhipu(text, resetCounter) + return { + text: convertLinksToZhipu(text, resetCounter), + hasBufferedContent: false + } } else if (providerType === 'openrouter') { - return convertLinksToOpenRouter(text, resetCounter) + return { + text: convertLinksToOpenRouter(text, resetCounter), + hasBufferedContent: false + } } else { return convertLinks(text, resetCounter) } } + +/** + * 强制返回buffer中的所有内容,用于流结束时清空缓冲区 + * @returns {string} buffer中剩余的所有内容 + */ +export function flushLinkConverterBuffer(): string { + const remainingBuffer = buffer + buffer = '' + return remainingBuffer +}