mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2026-01-08 14:29:15 +08:00
refactor(WebSearchMiddleware, linkConverter): enhance link processing and buffering logic (#7724)
- Updated WebSearchMiddleware to use the new structured result returned by smartLinkConverter, allowing better handling of buffered content and fallback logic.
- Introduced the flushLinkConverterBuffer function, which returns and clears any remaining buffered content at stream end.
- Modified the convertLinks and smartLinkConverter functions to return structured results indicating whether content was buffered.
- Enhanced unit tests to cover the new functionality and edge cases for link conversion and buffering behavior.
This commit is contained in:
parent
f58378daa0
commit
83f36f5e77
@ -1,5 +1,5 @@
|
|||||||
import { ChunkType } from '@renderer/types/chunk'
|
import { ChunkType } from '@renderer/types/chunk'
|
||||||
import { smartLinkConverter } from '@renderer/utils/linkConverter'
|
import { flushLinkConverterBuffer, smartLinkConverter } from '@renderer/utils/linkConverter'
|
||||||
|
|
||||||
import { CompletionsParams, CompletionsResult, GenericChunk } from '../schemas'
|
import { CompletionsParams, CompletionsResult, GenericChunk } from '../schemas'
|
||||||
import { CompletionsContext, CompletionsMiddleware } from '../types'
|
import { CompletionsContext, CompletionsMiddleware } from '../types'
|
||||||
@ -42,20 +42,46 @@ export const WebSearchMiddleware: CompletionsMiddleware =
|
|||||||
const providerType = model.provider || 'openai'
|
const providerType = model.provider || 'openai'
|
||||||
// 使用当前可用的Web搜索结果进行链接转换
|
// 使用当前可用的Web搜索结果进行链接转换
|
||||||
const text = chunk.text
|
const text = chunk.text
|
||||||
const processedText = smartLinkConverter(text, providerType, isFirstChunk)
|
const result = smartLinkConverter(text, providerType, isFirstChunk)
|
||||||
if (isFirstChunk) {
|
if (isFirstChunk) {
|
||||||
isFirstChunk = false
|
isFirstChunk = false
|
||||||
}
|
}
|
||||||
controller.enqueue({
|
|
||||||
...chunk,
|
// - 如果有内容被缓冲,说明convertLinks正在等待后续chunk,不使用原文本避免重复
|
||||||
text: processedText
|
// - 如果没有内容被缓冲且结果为空,可能是其他处理问题,使用原文本作为安全回退
|
||||||
})
|
let finalText: string
|
||||||
|
if (result.hasBufferedContent) {
|
||||||
|
// 有内容被缓冲,使用处理后的结果(可能为空,等待后续chunk)
|
||||||
|
finalText = result.text
|
||||||
|
} else {
|
||||||
|
// 没有内容被缓冲,可以安全使用回退逻辑
|
||||||
|
finalText = result.text || text
|
||||||
|
}
|
||||||
|
|
||||||
|
// 只有当finalText不为空时才发送chunk
|
||||||
|
if (finalText) {
|
||||||
|
controller.enqueue({
|
||||||
|
...chunk,
|
||||||
|
text: finalText
|
||||||
|
})
|
||||||
|
}
|
||||||
} else if (chunk.type === ChunkType.LLM_WEB_SEARCH_COMPLETE) {
|
} else if (chunk.type === ChunkType.LLM_WEB_SEARCH_COMPLETE) {
|
||||||
// 暂存Web搜索结果用于链接完善
|
// 暂存Web搜索结果用于链接完善
|
||||||
ctx._internal.webSearchState!.results = chunk.llm_web_search
|
ctx._internal.webSearchState!.results = chunk.llm_web_search
|
||||||
|
|
||||||
// 将Web搜索完成事件继续传递下去
|
// 将Web搜索完成事件继续传递下去
|
||||||
controller.enqueue(chunk)
|
controller.enqueue(chunk)
|
||||||
|
} else if (chunk.type === ChunkType.LLM_RESPONSE_COMPLETE) {
|
||||||
|
// 流结束时,清空链接转换器的buffer并处理剩余内容
|
||||||
|
const remainingText = flushLinkConverterBuffer()
|
||||||
|
if (remainingText) {
|
||||||
|
controller.enqueue({
|
||||||
|
type: ChunkType.TEXT_DELTA,
|
||||||
|
text: remainingText
|
||||||
|
})
|
||||||
|
}
|
||||||
|
// 继续传递LLM_RESPONSE_COMPLETE事件
|
||||||
|
controller.enqueue(chunk)
|
||||||
} else {
|
} else {
|
||||||
controller.enqueue(chunk)
|
controller.enqueue(chunk)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -7,7 +7,8 @@ import {
|
|||||||
convertLinksToHunyuan,
|
convertLinksToHunyuan,
|
||||||
convertLinksToOpenRouter,
|
convertLinksToOpenRouter,
|
||||||
convertLinksToZhipu,
|
convertLinksToZhipu,
|
||||||
extractUrlsFromMarkdown
|
extractUrlsFromMarkdown,
|
||||||
|
flushLinkConverterBuffer
|
||||||
} from '../linkConverter'
|
} from '../linkConverter'
|
||||||
|
|
||||||
describe('linkConverter', () => {
|
describe('linkConverter', () => {
|
||||||
@ -90,22 +91,197 @@ describe('linkConverter', () => {
|
|||||||
it('should convert links with domain-like text to numbered links', () => {
|
it('should convert links with domain-like text to numbered links', () => {
|
||||||
const input = '查看这个网站 [example.com](https://example.com)'
|
const input = '查看这个网站 [example.com](https://example.com)'
|
||||||
const result = convertLinks(input, true)
|
const result = convertLinks(input, true)
|
||||||
expect(result).toBe('查看这个网站 [<sup>1</sup>](https://example.com)')
|
expect(result.text).toBe('查看这个网站 [<sup>1</sup>](https://example.com)')
|
||||||
|
expect(result.hasBufferedContent).toBe(false)
|
||||||
})
|
})
|
||||||
|
|
||||||
it('should handle parenthesized link format ([host](url))', () => {
|
it('should handle parenthesized link format ([host](url))', () => {
|
||||||
const input = '这里有链接 ([example.com](https://example.com))'
|
const input = '这里有链接 ([example.com](https://example.com))'
|
||||||
const result = convertLinks(input, true)
|
const result = convertLinks(input, true)
|
||||||
expect(result).toBe('这里有链接 [<sup>1</sup>](https://example.com)')
|
expect(result.text).toBe('这里有链接 [<sup>1</sup>](https://example.com)')
|
||||||
|
expect(result.hasBufferedContent).toBe(false)
|
||||||
})
|
})
|
||||||
|
|
||||||
it('should use the same counter for duplicate URLs', () => {
|
it('should use the same counter for duplicate URLs', () => {
|
||||||
const input =
|
const input =
|
||||||
'第一个链接 [example.com](https://example.com) 和第二个相同链接 [subdomain.example.com](https://example.com)'
|
'第一个链接 [example.com](https://example.com) 和第二个相同链接 [subdomain.example.com](https://example.com)'
|
||||||
const result = convertLinks(input, true)
|
const result = convertLinks(input, true)
|
||||||
expect(result).toBe(
|
expect(result.text).toBe(
|
||||||
'第一个链接 [<sup>1</sup>](https://example.com) 和第二个相同链接 [<sup>1</sup>](https://example.com)'
|
'第一个链接 [<sup>1</sup>](https://example.com) 和第二个相同链接 [<sup>1</sup>](https://example.com)'
|
||||||
)
|
)
|
||||||
|
expect(result.hasBufferedContent).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should not misinterpret code placeholders as incomplete links', () => {
|
||||||
|
const input =
|
||||||
|
'The most common reason for a `404` error is that the repository specified in the `owner` and `repo`'
|
||||||
|
const result = convertLinks(input, true)
|
||||||
|
expect(result.text).toBe(
|
||||||
|
'The most common reason for a `404` error is that the repository specified in the `owner` and `repo`'
|
||||||
|
)
|
||||||
|
expect(result.hasBufferedContent).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should handle text with square brackets that are not links', () => {
|
||||||
|
const input = 'Use [owner] and [repo] placeholders in your configuration [file]'
|
||||||
|
const result = convertLinks(input, true)
|
||||||
|
expect(result.text).toBe('Use [owner] and [repo] placeholders in your configuration [file]')
|
||||||
|
expect(result.hasBufferedContent).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should handle markdown code blocks with square brackets', () => {
|
||||||
|
const input = 'In the code: `const config = { [key]: value }` you can see [brackets]'
|
||||||
|
const result = convertLinks(input, true)
|
||||||
|
expect(result.text).toBe('In the code: `const config = { [key]: value }` you can see [brackets]')
|
||||||
|
expect(result.hasBufferedContent).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should properly handle partial markdown link patterns', () => {
|
||||||
|
// 这种情况下,[text] 后面没有紧跟 (,所以不应该被当作潜在链接
|
||||||
|
const input = 'Check the [documentation] for more details'
|
||||||
|
const result = convertLinks(input, true)
|
||||||
|
expect(result.text).toBe('Check the [documentation] for more details')
|
||||||
|
expect(result.hasBufferedContent).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should correctly identify and handle real incomplete links', () => {
|
||||||
|
// 第一个块包含真正的不完整链接模式
|
||||||
|
const chunk1 = 'Visit [example.com]('
|
||||||
|
const result1 = convertLinks(chunk1, true)
|
||||||
|
expect(result1.text).toBe('Visit ')
|
||||||
|
expect(result1.hasBufferedContent).toBe(true)
|
||||||
|
|
||||||
|
// 第二个块完成该链接
|
||||||
|
const chunk2 = 'https://example.com) for more info'
|
||||||
|
const result2 = convertLinks(chunk2, false)
|
||||||
|
expect(result2.text).toBe('[<sup>1</sup>](https://example.com) for more info')
|
||||||
|
expect(result2.hasBufferedContent).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should handle mixed content with real links and placeholders', () => {
|
||||||
|
const input = 'Configure [owner] and [repo] in [GitHub](https://github.com) settings'
|
||||||
|
const result = convertLinks(input, true)
|
||||||
|
expect(result.text).toBe('Configure [owner] and [repo] in GitHub [<sup>1</sup>](https://github.com) settings')
|
||||||
|
expect(result.hasBufferedContent).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should handle empty text', () => {
|
||||||
|
const input = ''
|
||||||
|
const result = convertLinks(input, true)
|
||||||
|
expect(result.text).toBe('')
|
||||||
|
expect(result.hasBufferedContent).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should handle text with only square brackets', () => {
|
||||||
|
const input = '[][][]'
|
||||||
|
const result = convertLinks(input, true)
|
||||||
|
expect(result.text).toBe('[][][]')
|
||||||
|
expect(result.hasBufferedContent).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
describe('streaming small chunks simulation', () => {
|
||||||
|
it('should handle non-link placeholders in small chunks without buffering', () => {
|
||||||
|
// 模拟用户遇到的问题:包含方括号占位符的文本被分成小chunks
|
||||||
|
const chunks = [
|
||||||
|
'The most common reason for a `404` error is that the repository specified in the `',
|
||||||
|
'owner` and `',
|
||||||
|
'repo` parameters are incorrect.'
|
||||||
|
]
|
||||||
|
|
||||||
|
let accumulatedText = ''
|
||||||
|
|
||||||
|
// 第一个chunk
|
||||||
|
const result1 = convertLinks(chunks[0], true)
|
||||||
|
expect(result1.text).toBe(chunks[0]) // 应该立即返回,不缓冲
|
||||||
|
expect(result1.hasBufferedContent).toBe(false)
|
||||||
|
accumulatedText += result1.text
|
||||||
|
|
||||||
|
// 第二个chunk
|
||||||
|
const result2 = convertLinks(chunks[1], false)
|
||||||
|
expect(result2.text).toBe(chunks[1]) // 应该立即返回,不缓冲
|
||||||
|
expect(result2.hasBufferedContent).toBe(false)
|
||||||
|
accumulatedText += result2.text
|
||||||
|
|
||||||
|
// 第三个chunk
|
||||||
|
const result3 = convertLinks(chunks[2], false)
|
||||||
|
expect(result3.text).toBe(chunks[2]) // 应该立即返回,不缓冲
|
||||||
|
expect(result3.hasBufferedContent).toBe(false)
|
||||||
|
accumulatedText += result3.text
|
||||||
|
|
||||||
|
// 验证最终结果
|
||||||
|
expect(accumulatedText).toBe(chunks.join(''))
|
||||||
|
expect(accumulatedText).toBe(
|
||||||
|
'The most common reason for a `404` error is that the repository specified in the `owner` and `repo` parameters are incorrect.'
|
||||||
|
)
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should handle real links split across small chunks with proper buffering', () => {
|
||||||
|
// 模拟真实链接被分割成小chunks的情况 - 更现实的分割方式
|
||||||
|
const chunks = [
|
||||||
|
'Please visit [example.com](', // 不完整链接
|
||||||
|
'https://example.com) for details' // 完成链接
|
||||||
|
]
|
||||||
|
|
||||||
|
let accumulatedText = ''
|
||||||
|
|
||||||
|
// 第一个chunk:包含不完整链接 [text](
|
||||||
|
const result1 = convertLinks(chunks[0], true)
|
||||||
|
expect(result1.text).toBe('Please visit ') // 只返回安全部分
|
||||||
|
expect(result1.hasBufferedContent).toBe(true) // [example.com]( 被缓冲
|
||||||
|
accumulatedText += result1.text
|
||||||
|
|
||||||
|
// 第二个chunk:完成链接
|
||||||
|
const result2 = convertLinks(chunks[1], false)
|
||||||
|
expect(result2.text).toBe('[<sup>1</sup>](https://example.com) for details') // 完整链接 + 剩余文本
|
||||||
|
expect(result2.hasBufferedContent).toBe(false)
|
||||||
|
accumulatedText += result2.text
|
||||||
|
|
||||||
|
// 验证最终结果
|
||||||
|
expect(accumulatedText).toBe('Please visit [<sup>1</sup>](https://example.com) for details')
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should handle mixed content with placeholders and real links in small chunks', () => {
|
||||||
|
// 混合内容:既有占位符又有真实链接 - 更现实的分割方式
|
||||||
|
const chunks = [
|
||||||
|
'Configure [owner] and [repo] in [GitHub](', // 占位符 + 不完整链接
|
||||||
|
'https://github.com) settings page.' // 完成链接
|
||||||
|
]
|
||||||
|
|
||||||
|
let accumulatedText = ''
|
||||||
|
|
||||||
|
// 第一个chunk:包含占位符和不完整链接
|
||||||
|
const result1 = convertLinks(chunks[0], true)
|
||||||
|
expect(result1.text).toBe('Configure [owner] and [repo] in ') // 占位符保留,链接部分被缓冲
|
||||||
|
expect(result1.hasBufferedContent).toBe(true) // [GitHub]( 被缓冲
|
||||||
|
accumulatedText += result1.text
|
||||||
|
|
||||||
|
// 第二个chunk:完成链接
|
||||||
|
const result2 = convertLinks(chunks[1], false)
|
||||||
|
expect(result2.text).toBe('GitHub [<sup>1</sup>](https://github.com) settings page.') // 完整链接 + 剩余文本
|
||||||
|
expect(result2.hasBufferedContent).toBe(false)
|
||||||
|
accumulatedText += result2.text
|
||||||
|
|
||||||
|
// 验证最终结果
|
||||||
|
expect(accumulatedText).toBe(
|
||||||
|
'Configure [owner] and [repo] in GitHub [<sup>1</sup>](https://github.com) settings page.'
|
||||||
|
)
|
||||||
|
expect(accumulatedText).toContain('[owner] and [repo]') // 占位符保持原样
|
||||||
|
expect(accumulatedText).toContain('[<sup>1</sup>](https://github.com)') // 链接被转换
|
||||||
|
})
|
||||||
|
|
||||||
|
it('should properly handle buffer flush at stream end', () => {
|
||||||
|
// 测试流结束时的buffer清理
|
||||||
|
const incompleteChunk = 'Check the documentation at [GitHub]('
|
||||||
|
const result = convertLinks(incompleteChunk, true)
|
||||||
|
|
||||||
|
// 应该有内容被缓冲
|
||||||
|
expect(result.hasBufferedContent).toBe(true)
|
||||||
|
expect(result.text).toBe('Check the documentation at ') // 只返回安全部分
|
||||||
|
|
||||||
|
// 模拟流结束,强制清空buffer
|
||||||
|
const remainingText = flushLinkConverterBuffer()
|
||||||
|
expect(remainingText).toBe('[GitHub](') // buffer中的剩余内容
|
||||||
|
})
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
@ -126,9 +126,12 @@ export function convertLinksToHunyuan(text: string, webSearch: any[], resetCount
|
|||||||
*
|
*
|
||||||
* @param {string} text The current chunk of text to process
|
* @param {string} text The current chunk of text to process
|
||||||
* @param {boolean} resetCounter Whether to reset the counter and buffer
|
* @param {boolean} resetCounter Whether to reset the counter and buffer
|
||||||
* @returns {string} Processed text with complete links converted
|
* @returns {{text: string, hasBufferedContent: boolean}} Processed text and whether content was buffered
|
||||||
*/
|
*/
|
||||||
export function convertLinks(text: string, resetCounter: boolean = false): string {
|
export function convertLinks(
|
||||||
|
text: string,
|
||||||
|
resetCounter: boolean = false
|
||||||
|
): { text: string; hasBufferedContent: boolean } {
|
||||||
if (resetCounter) {
|
if (resetCounter) {
|
||||||
linkCounter = 1
|
linkCounter = 1
|
||||||
buffer = ''
|
buffer = ''
|
||||||
@ -158,12 +161,22 @@ export function convertLinks(text: string, resetCounter: boolean = false): strin
|
|||||||
} else if (buffer[i] === '[') {
|
} else if (buffer[i] === '[') {
|
||||||
// Check if this could be the start of a regular link
|
// Check if this could be the start of a regular link
|
||||||
const substring = buffer.substring(i)
|
const substring = buffer.substring(i)
|
||||||
const match = /^\[([^\]]+)\]\(([^)]+)\)/.exec(substring)
|
|
||||||
|
|
||||||
if (!match) {
|
// 检查是否是真正的不完整链接:[text]( 但没有完整的 url)
|
||||||
|
const incompleteLink = /^\[([^\]]+)\]\s*\([^)]*$/.test(substring)
|
||||||
|
if (incompleteLink) {
|
||||||
safePoint = i
|
safePoint = i
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 检查是否是完整的链接但需要验证
|
||||||
|
const completeLink = /^\[([^\]]+)\]\(([^)]+)\)/.test(substring)
|
||||||
|
if (completeLink) {
|
||||||
|
// 如果是完整链接,继续处理,不设置safePoint
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// 如果不是潜在的链接格式,继续检查
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -171,6 +184,9 @@ export function convertLinks(text: string, resetCounter: boolean = false): strin
|
|||||||
const safeBuffer = buffer.substring(0, safePoint)
|
const safeBuffer = buffer.substring(0, safePoint)
|
||||||
buffer = buffer.substring(safePoint)
|
buffer = buffer.substring(safePoint)
|
||||||
|
|
||||||
|
// 检查是否有内容被保留在buffer中
|
||||||
|
const hasBufferedContent = buffer.length > 0
|
||||||
|
|
||||||
// Process the safe buffer to handle complete links
|
// Process the safe buffer to handle complete links
|
||||||
let result = ''
|
let result = ''
|
||||||
let position = 0
|
let position = 0
|
||||||
@ -237,7 +253,10 @@ export function convertLinks(text: string, resetCounter: boolean = false): strin
|
|||||||
position++
|
position++
|
||||||
}
|
}
|
||||||
|
|
||||||
return result
|
return {
|
||||||
|
text: result,
|
||||||
|
hasBufferedContent
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -439,13 +458,13 @@ export function extractWebSearchReferences(text: string): Array<{
|
|||||||
* @param {any[]} webSearchResults Web搜索结果数组
|
* @param {any[]} webSearchResults Web搜索结果数组
|
||||||
* @param {string} providerType Provider类型 ('openai', 'zhipu', 'hunyuan', 'openrouter', etc.)
|
* @param {string} providerType Provider类型 ('openai', 'zhipu', 'hunyuan', 'openrouter', etc.)
|
||||||
* @param {boolean} resetCounter 是否重置计数器
|
* @param {boolean} resetCounter 是否重置计数器
|
||||||
* @returns {string} 转换后的文本
|
* @returns {{text: string, hasBufferedContent: boolean}} 转换后的文本和是否有内容被缓冲
|
||||||
*/
|
*/
|
||||||
export function smartLinkConverter(
|
export function smartLinkConverter(
|
||||||
text: string,
|
text: string,
|
||||||
providerType: string = 'openai',
|
providerType: string = 'openai',
|
||||||
resetCounter: boolean = false
|
resetCounter: boolean = false
|
||||||
): string {
|
): { text: string; hasBufferedContent: boolean } {
|
||||||
// 检测文本中的引用模式
|
// 检测文本中的引用模式
|
||||||
const references = extractWebSearchReferences(text)
|
const references = extractWebSearchReferences(text)
|
||||||
|
|
||||||
@ -458,10 +477,26 @@ export function smartLinkConverter(
|
|||||||
const hasZhipuPattern = references.some((ref) => ref.placeholder.includes('ref_'))
|
const hasZhipuPattern = references.some((ref) => ref.placeholder.includes('ref_'))
|
||||||
|
|
||||||
if (hasZhipuPattern) {
|
if (hasZhipuPattern) {
|
||||||
return convertLinksToZhipu(text, resetCounter)
|
return {
|
||||||
|
text: convertLinksToZhipu(text, resetCounter),
|
||||||
|
hasBufferedContent: false
|
||||||
|
}
|
||||||
} else if (providerType === 'openrouter') {
|
} else if (providerType === 'openrouter') {
|
||||||
return convertLinksToOpenRouter(text, resetCounter)
|
return {
|
||||||
|
text: convertLinksToOpenRouter(text, resetCounter),
|
||||||
|
hasBufferedContent: false
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
return convertLinks(text, resetCounter)
|
return convertLinks(text, resetCounter)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 强制返回buffer中的所有内容,用于流结束时清空缓冲区
|
||||||
|
* @returns {string} buffer中剩余的所有内容
|
||||||
|
*/
|
||||||
|
export function flushLinkConverterBuffer(): string {
|
||||||
|
const remainingBuffer = buffer
|
||||||
|
buffer = ''
|
||||||
|
return remainingBuffer
|
||||||
|
}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user