升级了下筛选

This commit is contained in:
1600822305 2025-04-24 01:34:35 +08:00
parent d33e16fa81
commit 05d110e4af
4 changed files with 438 additions and 126 deletions

View File

@ -22,9 +22,9 @@ const WebSearchInitializer = () => {
const deepSearchProvider: WebSearchProvider = {
id: 'deep-search',
name: 'DeepSearch',
description: '多引擎深度搜索',
usingBrowser: true,
contentLimit: 10000
contentLimit: 10000,
description: '多引擎深度搜索'
}
dispatch(addWebSearchProvider(deepSearchProvider))
}

View File

@ -25,6 +25,41 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
}
]
// 定义URL过滤规则
private urlFilters = {
// 排除的域名
excludedDomains: [
'login', 'signin', 'signup', 'register', 'account',
'download', 'shop', 'store', 'buy', 'cart', 'checkout',
'ads', 'advertisement', 'sponsor', 'tracking',
'facebook.com', 'twitter.com', 'instagram.com', 'pinterest.com',
'youtube.com/channel', 'youtube.com/user', 'youtube.com/c/',
'tiktok.com', 'douyin.com', 'weibo.com', 'zhihu.com/question',
'baike.baidu.com', 'wiki.com', 'wikipedia.org/wiki/Help:',
'wikipedia.org/wiki/Wikipedia:', 'wikipedia.org/wiki/Template:',
'wikipedia.org/wiki/File:', 'wikipedia.org/wiki/Category:',
'amazon.com/s', 'amazon.cn/s', 'taobao.com/search', 'jd.com/search',
'tmall.com/search', 'ebay.com/sch', 'aliexpress.com/wholesale'
],
// 优先的域名(相关性更高)
priorityDomains: [
'github.com/augment', 'augmentcode.com', 'augment.dev',
'github.com', 'stackoverflow.com', 'dev.to', 'medium.com',
'docs.github.com', 'npmjs.com', 'pypi.org', 'microsoft.com/en-us/learn',
'developer.mozilla.org', 'w3schools.com', 'reactjs.org', 'vuejs.org',
'angular.io', 'tensorflow.org', 'pytorch.org', 'kubernetes.io',
'docker.com', 'aws.amazon.com/documentation', 'cloud.google.com/docs',
'azure.microsoft.com/en-us/documentation'
],
// 排除的文件类型
excludedFileTypes: [
'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp',
'.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.wav', '.ogg',
'.zip', '.rar', '.7z', '.tar', '.gz', '.exe', '.dmg', '.apk',
'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'
]
}
// 分析模型配置
private analyzeConfig = {
enabled: true, // 是否启用预分析
@ -198,56 +233,246 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
const batchSize = this.analyzeConfig.batchSize
const analyzedResults: AnalyzedResult[] = [...results] // 复制原始结果
// 简单的分析逻辑:提取前几句作为摘要
// 预处理查询,提取重要关键词
const queryWords = query.toLowerCase().split(/\s+/)
// 过滤掉常见的停用词
const stopWords = new Set([
'a', 'an', 'the', 'and', 'or', 'but', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
'in', 'on', 'at', 'to', 'for', 'with', 'by', 'about', 'against', 'between', 'into', 'through',
'during', 'before', 'after', 'above', 'below', 'from', 'up', 'down', 'of', 'off', 'over', 'under',
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'should', 'now'
])
// 提取重要关键词,并为每个词分配权重
const keywordWeights = new Map<string, number>()
queryWords.forEach((word, index) => {
if (word.length > 2 && !stopWords.has(word)) {
// 根据词的位置分配权重,前面的词权重更高
const positionWeight = 1 - (index / queryWords.length) * 0.5 // 位置权重范围0.5-1.0
// 根据词的长度分配权重,更长的词权重更高
const lengthWeight = Math.min(1, word.length / 10) // 长度权重最高为1
// 组合权重
const weight = positionWeight * 0.7 + lengthWeight * 0.3
keywordWeights.set(word, weight)
}
})
// 如果没有提取到关键词,使用原始查询词
if (keywordWeights.size === 0) {
queryWords.forEach(word => {
if (word.length > 2) {
keywordWeights.set(word, 1.0)
}
})
}
// 分析每个结果
for (let i = 0; i < results.length; i++) {
const result = results[i]
if (result.content === noContent) continue
try {
// 提取摘要简单实现取前300个字符
// 提取摘要(改进实现,尝试找到包含关键词的最相关段落
const maxLength = this.analyzeConfig.maxSummaryLength
let summary = result.content.replace(/\n+/g, ' ').replace(/\s+/g, ' ').trim()
const content = result.content.replace(/\n+/g, ' ').replace(/\s+/g, ' ').trim()
if (summary.length > maxLength) {
// 截取到最后一个完整的句子
summary = summary.substring(0, maxLength)
// 将内容分成句子
const sentences = content.split(/(?<=[.!?])\s+/)
// 为每个句子评分
const sentenceScores = sentences.map(sentence => {
const sentenceLower = sentence.toLowerCase()
let score = 0
// 根据句子中包含的关键词计算分数
for (const [keyword, weight] of keywordWeights.entries()) {
if (sentenceLower.includes(keyword)) {
score += weight
// 如果关键词在句子开头,给予额外加分
if (sentenceLower.indexOf(keyword) < 20) {
score += weight * 0.5
}
}
}
return { sentence, score }
})
// 按分数排序句子
sentenceScores.sort((a, b) => b.score - a.score)
// 选择最相关的句子作为摘要
let summary = ''
let currentLength = 0
// 首先添加得分最高的句子
if (sentenceScores.length > 0 && sentenceScores[0].score > 0) {
summary = sentenceScores[0].sentence
currentLength = summary.length
}
// 如果还有空间,添加更多相关句子
for (let j = 1; j < sentenceScores.length && currentLength < maxLength; j++) {
if (sentenceScores[j].score > 0) {
const nextSentence = sentenceScores[j].sentence
if (currentLength + nextSentence.length + 1 <= maxLength) {
summary += ' ' + nextSentence
currentLength += nextSentence.length + 1
}
}
}
// 如果没有找到相关句子,回退到简单摘要提取
if (summary.length === 0) {
summary = content.substring(0, maxLength)
const lastPeriod = summary.lastIndexOf('.')
if (lastPeriod > maxLength * 0.7) {
// 至少要有总长度的70%
summary = summary.substring(0, lastPeriod + 1)
}
summary += '...'
} else if (summary.length < content.length) {
summary += '...'
}
// 提取关键词(简单实现,基于查询词拆分)
const keywords = query
.split(/\s+/)
.filter((word) => word.length > 2 && result.content.toLowerCase().includes(word.toLowerCase()))
// 提取关键词(改进实现)
const contentLower = result.content.toLowerCase()
const keywordScores = new Map<string, number>()
// 计算相关性评分(简单实现,基于关键词出现频率)
let relevanceScore = 0
if (keywords.length > 0) {
const contentLower = result.content.toLowerCase()
for (const word of keywords) {
const wordLower = word.toLowerCase()
// 计算关键词出现的次数
let count = 0
let pos = contentLower.indexOf(wordLower)
while (pos !== -1) {
count++
pos = contentLower.indexOf(wordLower, pos + 1)
}
relevanceScore += count
// 从内容中提取潜在关键词
const contentWords = contentLower.split(/\W+/).filter(word =>
word.length > 3 && !stopWords.has(word)
)
// 计算词频
const wordFrequency = new Map<string, number>()
contentWords.forEach(word => {
wordFrequency.set(word, (wordFrequency.get(word) || 0) + 1)
})
// 为每个词评分
for (const [word, freq] of wordFrequency.entries()) {
// 基础分数是词频
let score = freq
// 如果是查询关键词,增加分数
if (keywordWeights.has(word)) {
score += freq * keywordWeights.get(word)! * 3
}
// 标准化评分范围为0-1
relevanceScore = Math.min(1, relevanceScore / (contentLower.length / 100))
// 如果在标题中出现,增加分数
if (result.title.toLowerCase().includes(word)) {
score += 5
}
keywordScores.set(word, score)
}
// 添加查询关键词(如果内容中包含)
for (const [keyword, weight] of keywordWeights.entries()) {
if (contentLower.includes(keyword) && !keywordScores.has(keyword)) {
keywordScores.set(keyword, weight * 3)
}
}
// 选择得分最高的关键词
const sortedKeywords = Array.from(keywordScores.entries())
.sort((a, b) => b[1] - a[1])
.slice(0, 5)
.map(entry => entry[0])
// 计算相关性评分(改进实现)
let relevanceScore = 0
// 1. 基于关键词匹配度的评分
let keywordMatchScore = 0
let keywordCount = 0
for (const [keyword, weight] of keywordWeights.entries()) {
// 计算关键词出现的次数
let count = 0
let pos = contentLower.indexOf(keyword)
while (pos !== -1) {
count++
pos = contentLower.indexOf(keyword, pos + 1)
}
if (count > 0) {
keywordCount++
// 权重 * 出现次数 * 归一化因子
keywordMatchScore += weight * Math.min(10, count) / 10
}
}
// 归一化关键词匹配分数
if (keywordWeights.size > 0) {
keywordMatchScore = keywordMatchScore / keywordWeights.size
}
// 2. 基于标题相关性的评分
let titleScore = 0
const titleLower = result.title.toLowerCase()
for (const [keyword, weight] of keywordWeights.entries()) {
if (titleLower.includes(keyword)) {
titleScore += weight
}
}
// 归一化标题分数
if (keywordWeights.size > 0) {
titleScore = titleScore / keywordWeights.size
}
// 3. 基于内容长度的评分(适中长度最佳)
const contentLength = result.content.length
const lengthScore = Math.min(1, contentLength / 2000) * (contentLength < 50000 ? 1 : 0.5)
// 4. 基于URL的评分官方网站、知名网站加分
let urlScore = 0
const url = result.url.toLowerCase()
// 首先检查是否有预先计算的优先级分数
if (result.meta && result.meta.priorityScore) {
// 使用预先计算的分数
urlScore = result.meta.priorityScore
} else {
// 如果没有预先计算的分数,使用基于域名的评分
// 检查是否是官方网站或知名网站
if (url.includes('github.com/augment') ||
url.includes('augmentcode.com') ||
url.includes('augment.dev')) {
urlScore = 1.0 // 官方网站最高分
} else if (url.includes('github.com') ||
url.includes('stackoverflow.com') ||
url.includes('medium.com') ||
url.includes('dev.to')) {
urlScore = 0.8 // 知名技术网站高分
} else if (!url.includes('login') &&
!url.includes('signup') &&
!url.includes('register')) {
urlScore = 0.5 // 普通网站中等分
}
}
// 组合所有评分因素调整权重以提高URL质量的重要性
relevanceScore =
keywordMatchScore * 0.4 + // 关键词匹配度占40%
titleScore * 0.3 + // 标题相关性占30%
lengthScore * 0.05 + // 内容长度占5%
urlScore * 0.25 // URL质量占25%,增加了权重
// 确保分数在0-1范围内
relevanceScore = Math.min(1, Math.max(0, relevanceScore))
// 更新分析结果
analyzedResults[i] = {
...analyzedResults[i],
summary,
keywords,
keywords: sortedKeywords,
relevanceScore
}
@ -267,16 +492,22 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
return scoreB - scoreA
})
console.log(`[DeepSearch] 完成分析 ${results.length} 个结果`)
return analyzedResults
// 过滤掉明显不相关的结果,提高阈值以只保留更相关的结果
const filteredResults = analyzedResults.filter(result => {
const score = (result as AnalyzedResult).relevanceScore || 0
return score > 0.2 // 提高阈值到 0.2,只保留相关性分数较高的结果
})
console.log(`[DeepSearch] 完成分析 ${results.length} 个结果,过滤后剩余 ${filteredResults.length} 个结果`)
return filteredResults
}
/**
* URL
*
*/
protected parseValidUrls(htmlContent: string): Array<{ title: string; url: string }> {
const results: Array<{ title: string; url: string }> = []
protected parseValidUrls(htmlContent: string): Array<{ title: string; url: string; meta?: Record<string, any> }> {
const results: Array<{ title: string; url: string; meta?: Record<string, any> }> = []
try {
// 通用解析逻辑,查找所有链接
@ -288,7 +519,11 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
...doc.querySelectorAll('#content_left .result h3 a'),
...doc.querySelectorAll('#content_left .c-container h3 a'),
...doc.querySelectorAll('#content_left .c-container a.c-title'),
...doc.querySelectorAll('#content_left a[data-click]')
...doc.querySelectorAll('#content_left a[data-click]'),
// 添加更多选择器来适应百度的变化
...doc.querySelectorAll('#content_left .result-op a[href*="http"]'),
...doc.querySelectorAll('#content_left .c-container .t a'),
...doc.querySelectorAll('#content_left .c-container .c-title-text')
]
// 尝试解析Bing搜索结果 - 使用多个选择器来获取更多结果
@ -296,7 +531,11 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
...doc.querySelectorAll('.b_algo h2 a'),
...doc.querySelectorAll('.b_algo a.tilk'),
...doc.querySelectorAll('.b_algo a.b_title'),
...doc.querySelectorAll('.b_results a.b_restorLink')
...doc.querySelectorAll('.b_results a.b_restorLink'),
// 添加更多选择器来适应Bing的变化
...doc.querySelectorAll('.b_algo .b_caption a'),
...doc.querySelectorAll('.b_algo .b_attribution cite'),
...doc.querySelectorAll('.b_algo .b_deep a')
]
// 尝试解析DuckDuckGo搜索结果 - 使用多个选择器来获取更多结果
@ -368,19 +607,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
const url = (link as HTMLAnchorElement).href
const title = link.textContent || url
// 过滤掉搜索引擎内部链接和重复链接
if (
url &&
(url.startsWith('http') || url.startsWith('https')) &&
!url.includes('google.com/search') &&
!url.includes('bing.com/search') &&
!url.includes('baidu.com/s?') &&
!uniqueUrls.has(url)
) {
// 使用过滤方法检查URL
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
uniqueUrls.add(url)
results.push({
title: title.trim() || url,
url: url
url: url,
// 将优先级分数作为元数据保存
meta: { priorityScore }
})
}
} catch (error) {
@ -399,19 +636,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
const url = (link as HTMLAnchorElement).href
const title = link.textContent || url
// 过滤掉搜索引擎内部链接和重复链接
if (
url &&
(url.startsWith('http') || url.startsWith('https')) &&
!url.includes('google.com/search') &&
!url.includes('bing.com/search') &&
!url.includes('baidu.com/s?') &&
!uniqueUrls.has(url)
) {
// 使用过滤方法检查URL
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
uniqueUrls.add(url)
results.push({
title: title.trim() || url,
url: url
url: url,
// 将优先级分数作为元数据保存
meta: { priorityScore }
})
}
} catch (error) {
@ -430,21 +665,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
const url = (link as HTMLAnchorElement).href
const title = link.textContent || url
// 过滤掉搜索引擎内部链接和重复链接
if (
url &&
(url.startsWith('http') || url.startsWith('https')) &&
!url.includes('google.com/search') &&
!url.includes('bing.com/search') &&
!url.includes('baidu.com/s?') &&
!url.includes('sogou.com/web') &&
!url.includes('duckduckgo.com/?q=') &&
!uniqueUrls.has(url)
) {
// 使用过滤方法检查URL
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
uniqueUrls.add(url)
results.push({
title: title.trim() || url,
url: url
url: url,
// 将优先级分数作为元数据保存
meta: { priorityScore }
})
}
} catch (error) {
@ -465,21 +696,25 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
const url = (link as HTMLAnchorElement).href
const title = link.textContent || url
// 更宽松的过滤条件
// 使用过滤方法检查URL但使用更宽松的条件
if (
url &&
(url.startsWith('http') || url.startsWith('https')) &&
!url.includes('sogou.com/web') &&
!url.includes('sogou.com/web') && // 仍然过滤掉搜索引擎内部链接
!url.includes('javascript:') &&
!url.includes('mailto:') &&
!url.includes('tel:') &&
!uniqueUrls.has(url) &&
title.trim().length > 0
) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
uniqueUrls.add(url)
results.push({
title: title.trim() || url,
url: url
url: url,
meta: { priorityScore }
})
}
} catch (error) {
@ -501,22 +736,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
const url = (link as HTMLAnchorElement).href
const title = link.textContent || url
// 过滤掉搜索引擎内部链接和重复链接
if (
url &&
(url.startsWith('http') || url.startsWith('https')) &&
!url.includes('google.com/search') &&
!url.includes('bing.com/search') &&
!url.includes('baidu.com/s?') &&
!url.includes('sogou.com/web') &&
!url.includes('duckduckgo.com/?q=') &&
!url.includes('searx.tiekoetter.com/search') &&
!uniqueUrls.has(url)
) {
// 使用过滤方法检查URL
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
uniqueUrls.add(url)
results.push({
title: title.trim() || url,
url: url
url: url,
// 将优先级分数作为元数据保存
meta: { priorityScore }
})
}
} catch (error) {
@ -547,10 +777,14 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
!uniqueUrls.has(url) &&
title.trim().length > 0
) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
uniqueUrls.add(url)
results.push({
title: title.trim() || url,
url: url
url: url,
meta: { priorityScore }
})
}
} catch (error) {
@ -600,10 +834,14 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
!uniqueUrls.has(url) &&
title.trim().length > 0
) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
uniqueUrls.add(url)
results.push({
title: title.trim() || url,
url: url
url: url,
meta: { priorityScore }
})
}
} catch (error) {
@ -617,20 +855,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
const url = (link as HTMLAnchorElement).href
const title = link.textContent || url
// 过滤掉搜索引擎内部链接和重复链接
if (
url &&
(url.startsWith('http') || url.startsWith('https')) &&
!url.includes('google.com/search') &&
!url.includes('bing.com/search') &&
!url.includes('baidu.com/s?') &&
!url.includes('duckduckgo.com/?q=') &&
!uniqueUrls.has(url)
) {
// 使用过滤方法检查URL
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
uniqueUrls.add(url)
results.push({
title: title.trim() || url,
url: url
url: url,
// 将优先级分数作为元数据保存
meta: { priorityScore }
})
}
} catch (error) {
@ -661,10 +896,14 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
!url.includes('searx.tiekoetter.com/search') &&
!uniqueUrls.has(url)
) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
uniqueUrls.add(url)
results.push({
title: url,
url: url
url: url,
meta: { priorityScore }
})
}
}
@ -684,31 +923,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
const url = (link as HTMLAnchorElement).href
const title = link.textContent || url
// 过滤掉无效链接和搜索引擎内部链接
if (
url &&
(url.startsWith('http') || url.startsWith('https')) &&
!url.includes('google.com/search') &&
!url.includes('bing.com/search') &&
!url.includes('baidu.com/s?') &&
!url.includes('duckduckgo.com/?q=') &&
!url.includes('sogou.com/web') &&
!url.includes('searx.tiekoetter.com/search') &&
!uniqueUrls.has(url) &&
// 过滤掉常见的无用链接
!url.includes('javascript:') &&
!url.includes('mailto:') &&
!url.includes('tel:') &&
!url.includes('login') &&
!url.includes('register') &&
!url.includes('signup') &&
!url.includes('signin') &&
title.trim().length > 0
) {
// 使用过滤方法检查URL
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url) && title.trim().length > 0) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
uniqueUrls.add(url)
results.push({
title: title.trim(),
url: url
url: url,
// 将优先级分数作为元数据保存
meta: { priorityScore }
})
}
} catch (error) {
@ -725,6 +950,81 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
return results
}
/**
* URL是否应该被过滤掉
* @param url URL
* @returns truefalse
*/
private shouldFilterUrl(url: string): boolean {
if (!url || !url.startsWith('http')) {
return true
}
const urlLower = url.toLowerCase()
// 检查是否是搜索引擎内部链接
if (
urlLower.includes('google.com/search') ||
urlLower.includes('bing.com/search') ||
urlLower.includes('baidu.com/s?') ||
urlLower.includes('sogou.com/web') ||
urlLower.includes('duckduckgo.com/?q=') ||
urlLower.includes('searx.tiekoetter.com/search')
) {
return true
}
// 检查是否包含排除的域名
for (const domain of this.urlFilters.excludedDomains) {
if (urlLower.includes(domain)) {
return true
}
}
// 检查是否是排除的文件类型
for (const fileType of this.urlFilters.excludedFileTypes) {
if (urlLower.endsWith(fileType)) {
return true
}
}
// 检查是否是常见的无用链接
if (
urlLower.includes('javascript:') ||
urlLower.includes('mailto:') ||
urlLower.includes('tel:') ||
urlLower.includes('about:') ||
urlLower.includes('chrome:') ||
urlLower.includes('file:')
) {
return true
}
return false
}
/**
* URL的优先级分数
* @param url URL
* @returns 01
*/
private getUrlPriorityScore(url: string): number {
if (!url) return 0
const urlLower = url.toLowerCase()
// 检查是否是优先域名
for (let i = 0; i < this.urlFilters.priorityDomains.length; i++) {
const domain = this.urlFilters.priorityDomains[i]
if (urlLower.includes(domain)) {
// 根据域名在数组中的位置计算分数,前面的域名分数更高
return 1 - (i / this.urlFilters.priorityDomains.length) * 0.5
}
}
return 0
}
/**
*
*
@ -848,8 +1148,13 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
while ((match = markdownLinkRegex.exec(markdown)) !== null) {
const url = match[2]
if (url && (url.startsWith('http') || url.startsWith('https'))) {
urls.add(url)
if (url && (url.startsWith('http') || url.startsWith('https')) && !this.shouldFilterUrl(url)) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
// 优先级较高的URL更有可能被添加
if (priorityScore > 0.3 || Math.random() < 0.7) {
urls.add(url)
}
}
}
@ -857,8 +1162,13 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
const urlRegex = /(https?:\/\/[^\s]+)/g
while ((match = urlRegex.exec(markdown)) !== null) {
const url = match[1]
if (url) {
urls.add(url)
if (url && !this.shouldFilterUrl(url)) {
// 计算URL优先级分数
const priorityScore = this.getUrlPriorityScore(url)
// 优先级较高的URL更有可能被添加
if (priorityScore > 0.3 || Math.random() < 0.5) {
urls.add(url)
}
}
}

View File

@ -46,9 +46,9 @@ const initialState: WebSearchState = {
{
id: 'deep-search',
name: 'DeepSearch',
description: '多引擎深度搜索',
usingBrowser: true,
contentLimit: 10000
contentLimit: 10000,
description: '多引擎深度搜索'
},
{
id: 'local-google',

View File

@ -351,6 +351,7 @@ export type WebSearchProvider = {
basicAuthPassword?: string
contentLimit?: number
usingBrowser?: boolean
description?: string
}
export type WebSearchResponse = {
@ -363,6 +364,7 @@ export type WebSearchResult = {
content: string
url: string
source?: string
meta?: Record<string, any> // 添加meta字段用于存储元数据
}
export type KnowledgeReference = {