refactor(WebSearchMiddleware, linkConverter): enhance link processing and buffering logic (#7724)

- Updated WebSearchMiddleware to use the new structured result returned by smartLinkConverter, allowing better handling of buffered content and fallback logic.
- Introduced the flushLinkConverterBuffer function to flush any remaining buffered content at stream end.
- Modified the convertLinks and smartLinkConverter functions to return structured results indicating whether content was buffered.
- Enhanced unit tests to cover the new functionality and edge cases for link conversion and buffering behavior.
2025-12-26 03:31:24 +08:00 · 2025-07-02 03:03:03 +08:00 · 2025-07-02 03:03:03 +08:00 · 83f36f5e77
commit 83f36f5e77
parent f58378daa0
3 changed files with 256 additions and 19 deletions
--- a/src/renderer/src/aiCore/middleware/core/WebSearchMiddleware.ts
+++ b/src/renderer/src/aiCore/middleware/core/WebSearchMiddleware.ts
@ -1,5 +1,5 @@
 import { ChunkType } from '@renderer/types/chunk'
-import { smartLinkConverter } from '@renderer/utils/linkConverter'
+import { flushLinkConverterBuffer, smartLinkConverter } from '@renderer/utils/linkConverter'

 import { CompletionsParams, CompletionsResult, GenericChunk } from '../schemas'
 import { CompletionsContext, CompletionsMiddleware } from '../types'
@ -42,20 +42,46 @@ export const WebSearchMiddleware: CompletionsMiddleware =
                const providerType = model.provider || 'openai'
                // 使用当前可用的Web搜索结果进行链接转换
                const text = chunk.text
-                const processedText = smartLinkConverter(text, providerType, isFirstChunk)
+                const result = smartLinkConverter(text, providerType, isFirstChunk)
                if (isFirstChunk) {
                  isFirstChunk = false
                }
-                controller.enqueue({
-                  ...chunk,
-                  text: processedText
-                })
+
+                // - 如果有内容被缓冲，说明convertLinks正在等待后续chunk，不使用原文本避免重复
+                // - 如果没有内容被缓冲且结果为空，可能是其他处理问题，使用原文本作为安全回退
+                let finalText: string
+                if (result.hasBufferedContent) {
+                  // 有内容被缓冲，使用处理后的结果（可能为空，等待后续chunk）
+                  finalText = result.text
+                } else {
+                  // 没有内容被缓冲，可以安全使用回退逻辑
+                  finalText = result.text || text
+                }
+
+                // 只有当finalText不为空时才发送chunk
+                if (finalText) {
+                  controller.enqueue({
+                    ...chunk,
+                    text: finalText
+                  })
+                }
              } else if (chunk.type === ChunkType.LLM_WEB_SEARCH_COMPLETE) {
                // 暂存Web搜索结果用于链接完善
                ctx._internal.webSearchState!.results = chunk.llm_web_search

                // 将Web搜索完成事件继续传递下去
                controller.enqueue(chunk)
+              } else if (chunk.type === ChunkType.LLM_RESPONSE_COMPLETE) {
+                // 流结束时，清空链接转换器的buffer并处理剩余内容
+                const remainingText = flushLinkConverterBuffer()
+                if (remainingText) {
+                  controller.enqueue({
+                    type: ChunkType.TEXT_DELTA,
+                    text: remainingText
+                  })
+                }
+                // 继续传递LLM_RESPONSE_COMPLETE事件
+                controller.enqueue(chunk)
              } else {
                controller.enqueue(chunk)
              }
--- a/src/renderer/src/utils/tests/linkConverter.test.ts
+++ b/src/renderer/src/utils/tests/linkConverter.test.ts
@ -7,7 +7,8 @@ import {
  convertLinksToHunyuan,
  convertLinksToOpenRouter,
  convertLinksToZhipu,
-  extractUrlsFromMarkdown
+  extractUrlsFromMarkdown,
+  flushLinkConverterBuffer
 } from '../linkConverter'

 describe('linkConverter', () => {
@ -90,22 +91,197 @@ describe('linkConverter', () => {
    it('should convert links with domain-like text to numbered links', () => {
      const input = '查看这个网站 [example.com](https://example.com)'
      const result = convertLinks(input, true)
-      expect(result).toBe('查看这个网站 [<sup>1</sup>](https://example.com)')
+      expect(result.text).toBe('查看这个网站 [<sup>1</sup>](https://example.com)')
+      expect(result.hasBufferedContent).toBe(false)
    })

    it('should handle parenthesized link format ([host](url))', () => {
      const input = '这里有链接 ([example.com](https://example.com))'
      const result = convertLinks(input, true)
-      expect(result).toBe('这里有链接 [<sup>1</sup>](https://example.com)')
+      expect(result.text).toBe('这里有链接 [<sup>1</sup>](https://example.com)')
+      expect(result.hasBufferedContent).toBe(false)
    })

    it('should use the same counter for duplicate URLs', () => {
      const input =
        '第一个链接 [example.com](https://example.com) 和第二个相同链接 [subdomain.example.com](https://example.com)'
      const result = convertLinks(input, true)
-      expect(result).toBe(
+      expect(result.text).toBe(
        '第一个链接 [<sup>1</sup>](https://example.com) 和第二个相同链接 [<sup>1</sup>](https://example.com)'
      )
+      expect(result.hasBufferedContent).toBe(false)
+    })
+
+    it('should not misinterpret code placeholders as incomplete links', () => {
+      const input =
+        'The most common reason for a `404` error is that the repository specified in the `owner` and `repo`'
+      const result = convertLinks(input, true)
+      expect(result.text).toBe(
+        'The most common reason for a `404` error is that the repository specified in the `owner` and `repo`'
+      )
+      expect(result.hasBufferedContent).toBe(false)
+    })
+
+    it('should handle text with square brackets that are not links', () => {
+      const input = 'Use [owner] and [repo] placeholders in your configuration [file]'
+      const result = convertLinks(input, true)
+      expect(result.text).toBe('Use [owner] and [repo] placeholders in your configuration [file]')
+      expect(result.hasBufferedContent).toBe(false)
+    })
+
+    it('should handle markdown code blocks with square brackets', () => {
+      const input = 'In the code: `const config = { [key]: value }` you can see [brackets]'
+      const result = convertLinks(input, true)
+      expect(result.text).toBe('In the code: `const config = { [key]: value }` you can see [brackets]')
+      expect(result.hasBufferedContent).toBe(false)
+    })
+
+    it('should properly handle partial markdown link patterns', () => {
+      // Here [text] is not immediately followed by "(", so it must not be treated as a potential link
+      const input = 'Check the [documentation] for more details'
+      const result = convertLinks(input, true)
+      expect(result.text).toBe('Check the [documentation] for more details')
+      expect(result.hasBufferedContent).toBe(false)
+    })
+
+    it('should correctly identify and handle real incomplete links', () => {
+      // The first chunk contains a genuinely incomplete link pattern: "[text](" with no closing ")"
+      const chunk1 = 'Visit [example.com]('
+      const result1 = convertLinks(chunk1, true)
+      expect(result1.text).toBe('Visit ')
+      expect(result1.hasBufferedContent).toBe(true)
+
+      // The second chunk completes that link
+      const chunk2 = 'https://example.com) for more info'
+      const result2 = convertLinks(chunk2, false)
+      expect(result2.text).toBe('[<sup>1</sup>](https://example.com) for more info')
+      expect(result2.hasBufferedContent).toBe(false)
+    })
+
+    it('should handle mixed content with real links and placeholders', () => {
+      const input = 'Configure [owner] and [repo] in [GitHub](https://github.com) settings'
+      const result = convertLinks(input, true)
+      expect(result.text).toBe('Configure [owner] and [repo] in GitHub [<sup>1</sup>](https://github.com) settings')
+      expect(result.hasBufferedContent).toBe(false)
+    })
+
+    it('should handle empty text', () => {
+      const input = ''
+      const result = convertLinks(input, true)
+      expect(result.text).toBe('')
+      expect(result.hasBufferedContent).toBe(false)
+    })
+
+    it('should handle text with only square brackets', () => {
+      const input = '[][][]'
+      const result = convertLinks(input, true)
+      expect(result.text).toBe('[][][]')
+      expect(result.hasBufferedContent).toBe(false)
+    })
+
+    describe('streaming small chunks simulation', () => {
+      it('should handle non-link placeholders in small chunks without buffering', () => {
+        // Simulates the user-reported problem: text containing placeholders (backtick-quoted here) arrives split into small chunks
+        const chunks = [
+          'The most common reason for a `404` error is that the repository specified in the `',
+          'owner` and `',
+          'repo` parameters are incorrect.'
+        ]
+
+        let accumulatedText = ''
+
+        // First chunk (resets the converter state)
+        const result1 = convertLinks(chunks[0], true)
+        expect(result1.text).toBe(chunks[0]) // should be returned immediately, not buffered
+        expect(result1.hasBufferedContent).toBe(false)
+        accumulatedText += result1.text
+
+        // Second chunk
+        const result2 = convertLinks(chunks[1], false)
+        expect(result2.text).toBe(chunks[1]) // should be returned immediately, not buffered
+        expect(result2.hasBufferedContent).toBe(false)
+        accumulatedText += result2.text
+
+        // Third chunk
+        const result3 = convertLinks(chunks[2], false)
+        expect(result3.text).toBe(chunks[2]) // should be returned immediately, not buffered
+        expect(result3.hasBufferedContent).toBe(false)
+        accumulatedText += result3.text
+
+        // Verify the final result: the concatenated output equals the concatenated input
+        expect(accumulatedText).toBe(chunks.join(''))
+        expect(accumulatedText).toBe(
+          'The most common reason for a `404` error is that the repository specified in the `owner` and `repo` parameters are incorrect.'
+        )
+      })
+
+      it('should handle real links split across small chunks with proper buffering', () => {
+        // Simulates a real link being split across small chunks, using a more realistic split point
+        const chunks = [
+          'Please visit [example.com](', // incomplete link
+          'https://example.com) for details' // completes the link
+        ]
+
+        let accumulatedText = ''
+
+        // First chunk: contains the incomplete link "[text]("
+        const result1 = convertLinks(chunks[0], true)
+        expect(result1.text).toBe('Please visit ') // only the safe part is returned
+        expect(result1.hasBufferedContent).toBe(true) // "[example.com](" is held in the buffer
+        accumulatedText += result1.text
+
+        // Second chunk: completes the link
+        const result2 = convertLinks(chunks[1], false)
+        expect(result2.text).toBe('[<sup>1</sup>](https://example.com) for details') // converted link + remaining text
+        expect(result2.hasBufferedContent).toBe(false)
+        accumulatedText += result2.text
+
+        // Verify the final result
+        expect(accumulatedText).toBe('Please visit [<sup>1</sup>](https://example.com) for details')
+      })
+
+      it('should handle mixed content with placeholders and real links in small chunks', () => {
+        // Mixed content: both bracket placeholders and a real link, using a more realistic split point
+        const chunks = [
+          'Configure [owner] and [repo] in [GitHub](', // placeholders + incomplete link
+          'https://github.com) settings page.' // completes the link
+        ]
+
+        let accumulatedText = ''
+
+        // First chunk: contains placeholders and an incomplete link
+        const result1 = convertLinks(chunks[0], true)
+        expect(result1.text).toBe('Configure [owner] and [repo] in ') // placeholders are kept, the link part is buffered
+        expect(result1.hasBufferedContent).toBe(true) // "[GitHub](" is held in the buffer
+        accumulatedText += result1.text
+
+        // Second chunk: completes the link
+        const result2 = convertLinks(chunks[1], false)
+        expect(result2.text).toBe('GitHub [<sup>1</sup>](https://github.com) settings page.') // converted link + remaining text
+        expect(result2.hasBufferedContent).toBe(false)
+        accumulatedText += result2.text
+
+        // Verify the final result
+        expect(accumulatedText).toBe(
+          'Configure [owner] and [repo] in GitHub [<sup>1</sup>](https://github.com) settings page.'
+        )
+        expect(accumulatedText).toContain('[owner] and [repo]') // placeholders stay unchanged
+        expect(accumulatedText).toContain('[<sup>1</sup>](https://github.com)') // the link is converted
+      })
+
+      it('should properly handle buffer flush at stream end', () => {
+        // Tests that the buffer is cleared when the stream ends
+        const incompleteChunk = 'Check the documentation at [GitHub]('
+        const result = convertLinks(incompleteChunk, true)
+
+        // Some content should have been buffered
+        expect(result.hasBufferedContent).toBe(true)
+        expect(result.text).toBe('Check the documentation at ') // only the safe part is returned
+
+        // Simulate the end of the stream by forcing the buffer to be flushed
+        const remainingText = flushLinkConverterBuffer()
+        expect(remainingText).toBe('[GitHub](') // the raw content left in the buffer
+      })
    })
  })

--- a/src/renderer/src/utils/linkConverter.ts
+++ b/src/renderer/src/utils/linkConverter.ts
@ -126,9 +126,12 @@ export function convertLinksToHunyuan(text: string, webSearch: any[], resetCount
 *
 * @param {string} text The current chunk of text to process
 * @param {boolean} resetCounter Whether to reset the counter and buffer
- * @returns {string} Processed text with complete links converted
+ * @returns {{text: string, hasBufferedContent: boolean}} Processed text and whether content was buffered
 */
-export function convertLinks(text: string, resetCounter: boolean = false): string {
+export function convertLinks(
+  text: string,
+  resetCounter: boolean = false
+): { text: string; hasBufferedContent: boolean } {
  if (resetCounter) {
    linkCounter = 1
    buffer = ''
@ -158,12 +161,22 @@ export function convertLinks(text: string, resetCounter: boolean = false): strin
    } else if (buffer[i] === '[') {
      // Check if this could be the start of a regular link
      const substring = buffer.substring(i)
-      const match = /^\[([^\]]+)\]\(([^)]+)\)/.exec(substring)

-      if (!match) {
+      // 检查是否是真正的不完整链接：[text]( 但没有完整的 url)
+      const incompleteLink = /^\[([^\]]+)\]\s*\([^)]*$/.test(substring)
+      if (incompleteLink) {
        safePoint = i
        break
      }
+
+      // 检查是否是完整的链接但需要验证
+      const completeLink = /^\[([^\]]+)\]\(([^)]+)\)/.test(substring)
+      if (completeLink) {
+        // 如果是完整链接，继续处理，不设置safePoint
+        continue
+      }
+
+      // 如果不是潜在的链接格式，继续检查
    }
  }

@ -171,6 +184,9 @@ export function convertLinks(text: string, resetCounter: boolean = false): strin
  const safeBuffer = buffer.substring(0, safePoint)
  buffer = buffer.substring(safePoint)

+  // 检查是否有内容被保留在buffer中
+  const hasBufferedContent = buffer.length > 0
+
  // Process the safe buffer to handle complete links
  let result = ''
  let position = 0
@ -237,7 +253,10 @@ export function convertLinks(text: string, resetCounter: boolean = false): strin
    position++
  }

-  return result
+  return {
+    text: result,
+    hasBufferedContent
+  }
 }

 /**
@ -439,13 +458,13 @@ export function extractWebSearchReferences(text: string): Array<{
 * @param {any[]} webSearchResults Web搜索结果数组
 * @param {string} providerType Provider类型 ('openai', 'zhipu', 'hunyuan', 'openrouter', etc.)
 * @param {boolean} resetCounter 是否重置计数器
- * @returns {string} 转换后的文本
+ * @returns {{text: string, hasBufferedContent: boolean}} 转换后的文本和是否有内容被缓冲
 */
 export function smartLinkConverter(
  text: string,
  providerType: string = 'openai',
  resetCounter: boolean = false
-): string {
+): { text: string; hasBufferedContent: boolean } {
  // 检测文本中的引用模式
  const references = extractWebSearchReferences(text)

@ -458,10 +477,26 @@ export function smartLinkConverter(
  const hasZhipuPattern = references.some((ref) => ref.placeholder.includes('ref_'))

  if (hasZhipuPattern) {
-    return convertLinksToZhipu(text, resetCounter)
+    return {
+      text: convertLinksToZhipu(text, resetCounter),
+      hasBufferedContent: false
+    }
  } else if (providerType === 'openrouter') {
-    return convertLinksToOpenRouter(text, resetCounter)
+    return {
+      text: convertLinksToOpenRouter(text, resetCounter),
+      hasBufferedContent: false
+    }
  } else {
    return convertLinks(text, resetCounter)
  }
 }
+
+/**
+ * Forces out everything currently held in the shared `buffer` variable (declared
+ * outside this function) and resets that variable to an empty string. Intended to be
+ * called at stream end so the buffer is emptied and its held-back text is handed to the caller.
+ * The content is returned as-is: this function performs no link conversion
+ * and does not touch the link counter.
+ * @returns {string} All content that remained in the buffer
+ */
+export function flushLinkConverterBuffer(): string {
+  const remainingBuffer = buffer
+  buffer = ''
+  return remainingBuffer
+}