diff --git a/src/renderer/src/components/WebSearchInitializer.tsx b/src/renderer/src/components/WebSearchInitializer.tsx index ea710d6500..48026dc3fb 100644 --- a/src/renderer/src/components/WebSearchInitializer.tsx +++ b/src/renderer/src/components/WebSearchInitializer.tsx @@ -22,9 +22,9 @@ const WebSearchInitializer = () => { const deepSearchProvider: WebSearchProvider = { id: 'deep-search', name: 'DeepSearch', - description: '多引擎深度搜索', usingBrowser: true, - contentLimit: 10000 + contentLimit: 10000, + description: '多引擎深度搜索' } dispatch(addWebSearchProvider(deepSearchProvider)) } diff --git a/src/renderer/src/providers/WebSearchProvider/DeepSearchProvider.ts b/src/renderer/src/providers/WebSearchProvider/DeepSearchProvider.ts index b9e139cc73..b91ce938bc 100644 --- a/src/renderer/src/providers/WebSearchProvider/DeepSearchProvider.ts +++ b/src/renderer/src/providers/WebSearchProvider/DeepSearchProvider.ts @@ -25,6 +25,41 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { } ] + // 定义URL过滤规则 + private urlFilters = { + // 排除的域名 + excludedDomains: [ + 'login', 'signin', 'signup', 'register', 'account', + 'download', 'shop', 'store', 'buy', 'cart', 'checkout', + 'ads', 'advertisement', 'sponsor', 'tracking', + 'facebook.com', 'twitter.com', 'instagram.com', 'pinterest.com', + 'youtube.com/channel', 'youtube.com/user', 'youtube.com/c/', + 'tiktok.com', 'douyin.com', 'weibo.com', 'zhihu.com/question', + 'baike.baidu.com', 'wiki.com', 'wikipedia.org/wiki/Help:', + 'wikipedia.org/wiki/Wikipedia:', 'wikipedia.org/wiki/Template:', + 'wikipedia.org/wiki/File:', 'wikipedia.org/wiki/Category:', + 'amazon.com/s', 'amazon.cn/s', 'taobao.com/search', 'jd.com/search', + 'tmall.com/search', 'ebay.com/sch', 'aliexpress.com/wholesale' + ], + // 优先的域名(相关性更高) + priorityDomains: [ + 'github.com/augment', 'augmentcode.com', 'augment.dev', + 'github.com', 'stackoverflow.com', 'dev.to', 'medium.com', + 'docs.github.com', 'npmjs.com', 'pypi.org', 'microsoft.com/en-us/learn', + 'developer.mozilla.org', 'w3schools.com', 'reactjs.org', 'vuejs.org', + 'angular.io', 'tensorflow.org', 'pytorch.org', 'kubernetes.io', + 'docker.com', 'aws.amazon.com/documentation', 'cloud.google.com/docs', + 'azure.microsoft.com/en-us/documentation' + ], + // 排除的文件类型 + excludedFileTypes: [ + '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp', + '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.wav', '.ogg', + '.zip', '.rar', '.7z', '.tar', '.gz', '.exe', '.dmg', '.apk', + '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx' + ] + } + // 分析模型配置 private analyzeConfig = { enabled: true, // 是否启用预分析 @@ -198,56 +233,246 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { const batchSize = this.analyzeConfig.batchSize const analyzedResults: AnalyzedResult[] = [...results] // 复制原始结果 - // 简单的分析逻辑:提取前几句作为摘要 + // 预处理查询,提取重要关键词 + const queryWords = query.toLowerCase().split(/\s+/) + + // 过滤掉常见的停用词 + const stopWords = new Set([ + 'a', 'an', 'the', 'and', 'or', 'but', 'is', 'are', 'was', 'were', 'be', 'been', 'being', + 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'about', 'against', 'between', 'into', 'through', + 'during', 'before', 'after', 'above', 'below', 'from', 'up', 'down', 'of', 'off', 'over', 'under', + 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', + 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', + 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'should', 'now' + ]) + + // 提取重要关键词,并为每个词分配权重 + const keywordWeights = new Map() + queryWords.forEach((word, index) => { + if (word.length > 2 && !stopWords.has(word)) { + // 根据词的位置分配权重,前面的词权重更高 + const positionWeight = 1 - (index / queryWords.length) * 0.5 // 位置权重范围:0.5-1.0 + // 根据词的长度分配权重,更长的词权重更高 + const lengthWeight = Math.min(1, word.length / 10) // 长度权重最高为1 + // 组合权重 + const weight = positionWeight * 0.7 + lengthWeight * 0.3 + keywordWeights.set(word, weight) + } + }) + + // 如果没有提取到关键词,使用原始查询词 + if (keywordWeights.size === 0) { + queryWords.forEach(word => { + if (word.length > 2) { + keywordWeights.set(word, 1.0) + } + }) + } + + // 分析每个结果 for (let i = 0; i < results.length; i++) { const result = results[i] if (result.content === noContent) continue try { - // 提取摘要(简单实现,取前300个字符) + // 提取摘要(改进实现,尝试找到包含关键词的最相关段落) const maxLength = this.analyzeConfig.maxSummaryLength - let summary = result.content.replace(/\n+/g, ' ').replace(/\s+/g, ' ').trim() + const content = result.content.replace(/\n+/g, ' ').replace(/\s+/g, ' ').trim() - if (summary.length > maxLength) { - // 截取到最后一个完整的句子 - summary = summary.substring(0, maxLength) + // 将内容分成句子 + const sentences = content.split(/(?<=[.!?])\s+/) + + // 为每个句子评分 + const sentenceScores = sentences.map(sentence => { + const sentenceLower = sentence.toLowerCase() + let score = 0 + + // 根据句子中包含的关键词计算分数 + for (const [keyword, weight] of keywordWeights.entries()) { + if (sentenceLower.includes(keyword)) { + score += weight + + // 如果关键词在句子开头,给予额外加分 + if (sentenceLower.indexOf(keyword) < 20) { + score += weight * 0.5 + } + } + } + + return { sentence, score } + }) + + // 按分数排序句子 + sentenceScores.sort((a, b) => b.score - a.score) + + // 选择最相关的句子作为摘要 + let summary = '' + let currentLength = 0 + + // 首先添加得分最高的句子 + if (sentenceScores.length > 0 && sentenceScores[0].score > 0) { + summary = sentenceScores[0].sentence + currentLength = summary.length + } + + // 如果还有空间,添加更多相关句子 + for (let j = 1; j < sentenceScores.length && currentLength < maxLength; j++) { + if (sentenceScores[j].score > 0) { + const nextSentence = sentenceScores[j].sentence + if (currentLength + nextSentence.length + 1 <= maxLength) { + summary += ' ' + nextSentence + currentLength += nextSentence.length + 1 + } + } + } + + // 如果没有找到相关句子,回退到简单摘要提取 + if (summary.length === 0) { + summary = content.substring(0, maxLength) const lastPeriod = summary.lastIndexOf('.') if (lastPeriod > maxLength * 0.7) { - // 至少要有总长度的70% summary = summary.substring(0, lastPeriod + 1) } summary += '...' + } else if (summary.length < content.length) { + summary += '...' } - // 提取关键词(简单实现,基于查询词拆分) - const keywords = query - .split(/\s+/) - .filter((word) => word.length > 2 && result.content.toLowerCase().includes(word.toLowerCase())) + // 提取关键词(改进实现) + const contentLower = result.content.toLowerCase() + const keywordScores = new Map() - // 计算相关性评分(简单实现,基于关键词出现频率) - let relevanceScore = 0 - if (keywords.length > 0) { - const contentLower = result.content.toLowerCase() - for (const word of keywords) { - const wordLower = word.toLowerCase() - // 计算关键词出现的次数 - let count = 0 - let pos = contentLower.indexOf(wordLower) - while (pos !== -1) { - count++ - pos = contentLower.indexOf(wordLower, pos + 1) - } - relevanceScore += count + // 从内容中提取潜在关键词 + const contentWords = contentLower.split(/\W+/).filter(word => + word.length > 3 && !stopWords.has(word) + ) + + // 计算词频 + const wordFrequency = new Map() + contentWords.forEach(word => { + wordFrequency.set(word, (wordFrequency.get(word) || 0) + 1) + }) + + // 为每个词评分 + for (const [word, freq] of wordFrequency.entries()) { + // 基础分数是词频 + let score = freq + + // 如果是查询关键词,增加分数 + if (keywordWeights.has(word)) { + score += freq * keywordWeights.get(word)! * 3 } - // 标准化评分,范围为0-1 - relevanceScore = Math.min(1, relevanceScore / (contentLower.length / 100)) + + // 如果在标题中出现,增加分数 + if (result.title.toLowerCase().includes(word)) { + score += 5 + } + + keywordScores.set(word, score) } + // 添加查询关键词(如果内容中包含) + for (const [keyword, weight] of keywordWeights.entries()) { + if (contentLower.includes(keyword) && !keywordScores.has(keyword)) { + keywordScores.set(keyword, weight * 3) + } + } + + // 选择得分最高的关键词 + const sortedKeywords = Array.from(keywordScores.entries()) + .sort((a, b) => b[1] - a[1]) + .slice(0, 5) + .map(entry => entry[0]) + + // 计算相关性评分(改进实现) + let relevanceScore = 0 + + // 1. 基于关键词匹配度的评分 + let keywordMatchScore = 0 + let keywordCount = 0 + + for (const [keyword, weight] of keywordWeights.entries()) { + // 计算关键词出现的次数 + let count = 0 + let pos = contentLower.indexOf(keyword) + while (pos !== -1) { + count++ + pos = contentLower.indexOf(keyword, pos + 1) + } + + if (count > 0) { + keywordCount++ + // 权重 * 出现次数 * 归一化因子 + keywordMatchScore += weight * Math.min(10, count) / 10 + } + } + + // 归一化关键词匹配分数 + if (keywordWeights.size > 0) { + keywordMatchScore = keywordMatchScore / keywordWeights.size + } + + // 2. 基于标题相关性的评分 + let titleScore = 0 + const titleLower = result.title.toLowerCase() + + for (const [keyword, weight] of keywordWeights.entries()) { + if (titleLower.includes(keyword)) { + titleScore += weight + } + } + + // 归一化标题分数 + if (keywordWeights.size > 0) { + titleScore = titleScore / keywordWeights.size + } + + // 3. 基于内容长度的评分(适中长度最佳) + const contentLength = result.content.length + const lengthScore = Math.min(1, contentLength / 2000) * (contentLength < 50000 ? 1 : 0.5) + + // 4. 基于URL的评分(官方网站、知名网站加分) + let urlScore = 0 + const url = result.url.toLowerCase() + + // 首先检查是否有预先计算的优先级分数 + if (result.meta && result.meta.priorityScore) { + // 使用预先计算的分数 + urlScore = result.meta.priorityScore + } else { + // 如果没有预先计算的分数,使用基于域名的评分 + // 检查是否是官方网站或知名网站 + if (url.includes('github.com/augment') || + url.includes('augmentcode.com') || + url.includes('augment.dev')) { + urlScore = 1.0 // 官方网站最高分 + } else if (url.includes('github.com') || + url.includes('stackoverflow.com') || + url.includes('medium.com') || + url.includes('dev.to')) { + urlScore = 0.8 // 知名技术网站高分 + } else if (!url.includes('login') && + !url.includes('signup') && + !url.includes('register')) { + urlScore = 0.5 // 普通网站中等分 + } + } + + // 组合所有评分因素,调整权重以提高URL质量的重要性 + relevanceScore = + keywordMatchScore * 0.4 + // 关键词匹配度占40% + titleScore * 0.3 + // 标题相关性占30% + lengthScore * 0.05 + // 内容长度占5% + urlScore * 0.25 // URL质量占25%,增加了权重 + + // 确保分数在0-1范围内 + relevanceScore = Math.min(1, Math.max(0, relevanceScore)) + // 更新分析结果 analyzedResults[i] = { ...analyzedResults[i], summary, - keywords, + keywords: sortedKeywords, relevanceScore } @@ -267,16 +492,22 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { return scoreB - scoreA }) - console.log(`[DeepSearch] 完成分析 ${results.length} 个结果`) - return analyzedResults + // 过滤掉明显不相关的结果,提高阈值以只保留更相关的结果 + const filteredResults = analyzedResults.filter(result => { + const score = (result as AnalyzedResult).relevanceScore || 0 + return score > 0.2 // 提高阈值到 0.2,只保留相关性分数较高的结果 + }) + + console.log(`[DeepSearch] 完成分析 ${results.length} 个结果,过滤后剩余 ${filteredResults.length} 个结果`) + return filteredResults } /** * 解析搜索结果页面中的URL * 默认实现,子类可以覆盖此方法以适应不同的搜索引擎 */ - protected parseValidUrls(htmlContent: string): Array<{ title: string; url: string }> { - const results: Array<{ title: string; url: string }> = [] + protected parseValidUrls(htmlContent: string): Array<{ title: string; url: string; meta?: Record }> { + const results: Array<{ title: string; url: string; meta?: Record }> = [] try { // 通用解析逻辑,查找所有链接 @@ -288,7 +519,11 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { ...doc.querySelectorAll('#content_left .result h3 a'), ...doc.querySelectorAll('#content_left .c-container h3 a'), ...doc.querySelectorAll('#content_left .c-container a.c-title'), - ...doc.querySelectorAll('#content_left a[data-click]') + ...doc.querySelectorAll('#content_left a[data-click]'), + // 添加更多选择器来适应百度的变化 + ...doc.querySelectorAll('#content_left .result-op a[href*="http"]'), + ...doc.querySelectorAll('#content_left .c-container .t a'), + ...doc.querySelectorAll('#content_left .c-container .c-title-text') ] // 尝试解析Bing搜索结果 - 使用多个选择器来获取更多结果 @@ -296,7 +531,11 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { ...doc.querySelectorAll('.b_algo h2 a'), ...doc.querySelectorAll('.b_algo a.tilk'), ...doc.querySelectorAll('.b_algo a.b_title'), - ...doc.querySelectorAll('.b_results a.b_restorLink') + ...doc.querySelectorAll('.b_results a.b_restorLink'), + // 添加更多选择器来适应Bing的变化 + ...doc.querySelectorAll('.b_algo .b_caption a'), + ...doc.querySelectorAll('.b_algo .b_attribution cite'), + ...doc.querySelectorAll('.b_algo .b_deep a') ] // 尝试解析DuckDuckGo搜索结果 - 使用多个选择器来获取更多结果 @@ -368,19 +607,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { const url = (link as HTMLAnchorElement).href const title = link.textContent || url - // 过滤掉搜索引擎内部链接和重复链接 - if ( - url && - (url.startsWith('http') || url.startsWith('https')) && - !url.includes('google.com/search') && - !url.includes('bing.com/search') && - !url.includes('baidu.com/s?') && - !uniqueUrls.has(url) - ) { + // 使用过滤方法检查URL + if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + uniqueUrls.add(url) results.push({ title: title.trim() || url, - url: url + url: url, + // 将优先级分数作为元数据保存 + meta: { priorityScore } }) } } catch (error) { @@ -399,19 +636,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { const url = (link as HTMLAnchorElement).href const title = link.textContent || url - // 过滤掉搜索引擎内部链接和重复链接 - if ( - url && - (url.startsWith('http') || url.startsWith('https')) && - !url.includes('google.com/search') && - !url.includes('bing.com/search') && - !url.includes('baidu.com/s?') && - !uniqueUrls.has(url) - ) { + // 使用过滤方法检查URL + if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + uniqueUrls.add(url) results.push({ title: title.trim() || url, - url: url + url: url, + // 将优先级分数作为元数据保存 + meta: { priorityScore } }) } } catch (error) { @@ -430,21 +665,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { const url = (link as HTMLAnchorElement).href const title = link.textContent || url - // 过滤掉搜索引擎内部链接和重复链接 - if ( - url && - (url.startsWith('http') || url.startsWith('https')) && - !url.includes('google.com/search') && - !url.includes('bing.com/search') && - !url.includes('baidu.com/s?') && - !url.includes('sogou.com/web') && - !url.includes('duckduckgo.com/?q=') && - !uniqueUrls.has(url) - ) { + // 使用过滤方法检查URL + if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + uniqueUrls.add(url) results.push({ title: title.trim() || url, - url: url + url: url, + // 将优先级分数作为元数据保存 + meta: { priorityScore } }) } } catch (error) { @@ -465,21 +696,25 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { const url = (link as HTMLAnchorElement).href const title = link.textContent || url - // 更宽松的过滤条件 + // 使用过滤方法检查URL,但使用更宽松的条件 if ( url && (url.startsWith('http') || url.startsWith('https')) && - !url.includes('sogou.com/web') && + !url.includes('sogou.com/web') && // 仍然过滤掉搜索引擎内部链接 !url.includes('javascript:') && !url.includes('mailto:') && !url.includes('tel:') && !uniqueUrls.has(url) && title.trim().length > 0 ) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + uniqueUrls.add(url) results.push({ title: title.trim() || url, - url: url + url: url, + meta: { priorityScore } }) } } catch (error) { @@ -501,22 +736,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { const url = (link as HTMLAnchorElement).href const title = link.textContent || url - // 过滤掉搜索引擎内部链接和重复链接 - if ( - url && - (url.startsWith('http') || url.startsWith('https')) && - !url.includes('google.com/search') && - !url.includes('bing.com/search') && - !url.includes('baidu.com/s?') && - !url.includes('sogou.com/web') && - !url.includes('duckduckgo.com/?q=') && - !url.includes('searx.tiekoetter.com/search') && - !uniqueUrls.has(url) - ) { + // 使用过滤方法检查URL + if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + uniqueUrls.add(url) results.push({ title: title.trim() || url, - url: url + url: url, + // 将优先级分数作为元数据保存 + meta: { priorityScore } }) } } catch (error) { @@ -547,10 +777,14 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { !uniqueUrls.has(url) && title.trim().length > 0 ) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + uniqueUrls.add(url) results.push({ title: title.trim() || url, - url: url + url: url, + meta: { priorityScore } }) } } catch (error) { @@ -600,10 +834,14 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { !uniqueUrls.has(url) && title.trim().length > 0 ) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + uniqueUrls.add(url) results.push({ title: title.trim() || url, - url: url + url: url, + meta: { priorityScore } }) } } catch (error) { @@ -617,20 +855,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { const url = (link as HTMLAnchorElement).href const title = link.textContent || url - // 过滤掉搜索引擎内部链接和重复链接 - if ( - url && - (url.startsWith('http') || url.startsWith('https')) && - !url.includes('google.com/search') && - !url.includes('bing.com/search') && - !url.includes('baidu.com/s?') && - !url.includes('duckduckgo.com/?q=') && - !uniqueUrls.has(url) - ) { + // 使用过滤方法检查URL + if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + uniqueUrls.add(url) results.push({ title: title.trim() || url, - url: url + url: url, + // 将优先级分数作为元数据保存 + meta: { priorityScore } }) } } catch (error) { @@ -661,10 +896,14 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { !url.includes('searx.tiekoetter.com/search') && !uniqueUrls.has(url) ) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + uniqueUrls.add(url) results.push({ title: url, - url: url + url: url, + meta: { priorityScore } }) } } @@ -684,31 +923,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { const url = (link as HTMLAnchorElement).href const title = link.textContent || url - // 过滤掉无效链接和搜索引擎内部链接 - if ( - url && - (url.startsWith('http') || url.startsWith('https')) && - !url.includes('google.com/search') && - !url.includes('bing.com/search') && - !url.includes('baidu.com/s?') && - !url.includes('duckduckgo.com/?q=') && - !url.includes('sogou.com/web') && - !url.includes('searx.tiekoetter.com/search') && - !uniqueUrls.has(url) && - // 过滤掉常见的无用链接 - !url.includes('javascript:') && - !url.includes('mailto:') && - !url.includes('tel:') && - !url.includes('login') && - !url.includes('register') && - !url.includes('signup') && - !url.includes('signin') && - title.trim().length > 0 - ) { + // 使用过滤方法检查URL + if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url) && title.trim().length > 0) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + uniqueUrls.add(url) results.push({ title: title.trim(), - url: url + url: url, + // 将优先级分数作为元数据保存 + meta: { priorityScore } }) } } catch (error) { @@ -725,6 +950,81 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { return results } + /** + * 检查URL是否应该被过滤掉 + * @param url 要检查的URL + * @returns 如果应该过滤掉返回true,否则返回false + */ + private shouldFilterUrl(url: string): boolean { + if (!url || !url.startsWith('http')) { + return true + } + + const urlLower = url.toLowerCase() + + // 检查是否是搜索引擎内部链接 + if ( + urlLower.includes('google.com/search') || + urlLower.includes('bing.com/search') || + urlLower.includes('baidu.com/s?') || + urlLower.includes('sogou.com/web') || + urlLower.includes('duckduckgo.com/?q=') || + urlLower.includes('searx.tiekoetter.com/search') + ) { + return true + } + + // 检查是否包含排除的域名 + for (const domain of this.urlFilters.excludedDomains) { + if (urlLower.includes(domain)) { + return true + } + } + + // 检查是否是排除的文件类型 + for (const fileType of this.urlFilters.excludedFileTypes) { + if (urlLower.endsWith(fileType)) { + return true + } + } + + // 检查是否是常见的无用链接 + if ( + urlLower.includes('javascript:') || + urlLower.includes('mailto:') || + urlLower.includes('tel:') || + urlLower.includes('about:') || + urlLower.includes('chrome:') || + urlLower.includes('file:') + ) { + return true + } + + return false + } + + /** + * 计算URL的优先级分数 + * @param url 要计算的URL + * @returns 优先级分数,范围从0到1,越高越优先 + */ + private getUrlPriorityScore(url: string): number { + if (!url) return 0 + + const urlLower = url.toLowerCase() + + // 检查是否是优先域名 + for (let i = 0; i < this.urlFilters.priorityDomains.length; i++) { + const domain = this.urlFilters.priorityDomains[i] + if (urlLower.includes(domain)) { + // 根据域名在数组中的位置计算分数,前面的域名分数更高 + return 1 - (i / this.urlFilters.priorityDomains.length) * 0.5 + } + } + + return 0 + } + /** * 深度抓取内容 * 不仅抓取搜索结果页面,还会抓取页面中的链接 @@ -848,8 +1148,13 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { while ((match = markdownLinkRegex.exec(markdown)) !== null) { const url = match[2] - if (url && (url.startsWith('http') || url.startsWith('https'))) { - urls.add(url) + if (url && (url.startsWith('http') || url.startsWith('https')) && !this.shouldFilterUrl(url)) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + // 优先级较高的URL更有可能被添加 + if (priorityScore > 0.3 || Math.random() < 0.7) { + urls.add(url) + } } } @@ -857,8 +1162,13 @@ export default class DeepSearchProvider extends BaseWebSearchProvider { const urlRegex = /(https?:\/\/[^\s]+)/g while ((match = urlRegex.exec(markdown)) !== null) { const url = match[1] - if (url) { - urls.add(url) + if (url && !this.shouldFilterUrl(url)) { + // 计算URL优先级分数 + const priorityScore = this.getUrlPriorityScore(url) + // 优先级较高的URL更有可能被添加 + if (priorityScore > 0.3 || Math.random() < 0.5) { + urls.add(url) + } } } diff --git a/src/renderer/src/store/websearch.ts b/src/renderer/src/store/websearch.ts index 07bb68f7c0..64029c7ff0 100644 --- a/src/renderer/src/store/websearch.ts +++ b/src/renderer/src/store/websearch.ts @@ -46,9 +46,9 @@ const initialState: WebSearchState = { { id: 'deep-search', name: 'DeepSearch', - description: '多引擎深度搜索', usingBrowser: true, - contentLimit: 10000 + contentLimit: 10000, + description: '多引擎深度搜索' }, { id: 'local-google', diff --git a/src/renderer/src/types/index.ts b/src/renderer/src/types/index.ts index 9ec73dc193..3d19f64706 100644 --- a/src/renderer/src/types/index.ts +++ b/src/renderer/src/types/index.ts @@ -351,6 +351,7 @@ export type WebSearchProvider = { basicAuthPassword?: string contentLimit?: number usingBrowser?: boolean + description?: string } export type WebSearchResponse = { @@ -363,6 +364,7 @@ export type WebSearchResult = { content: string url: string source?: string + meta?: Record // 添加meta字段用于存储元数据 } export type KnowledgeReference = {