mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2025-12-25 11:20:07 +08:00
升级了下筛选
This commit is contained in:
parent
d33e16fa81
commit
05d110e4af
@ -22,9 +22,9 @@ const WebSearchInitializer = () => {
|
||||
const deepSearchProvider: WebSearchProvider = {
|
||||
id: 'deep-search',
|
||||
name: 'DeepSearch',
|
||||
description: '多引擎深度搜索',
|
||||
usingBrowser: true,
|
||||
contentLimit: 10000
|
||||
contentLimit: 10000,
|
||||
description: '多引擎深度搜索'
|
||||
}
|
||||
dispatch(addWebSearchProvider(deepSearchProvider))
|
||||
}
|
||||
|
||||
@ -25,6 +25,41 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
}
|
||||
]
|
||||
|
||||
// 定义URL过滤规则
|
||||
private urlFilters = {
|
||||
// 排除的域名
|
||||
excludedDomains: [
|
||||
'login', 'signin', 'signup', 'register', 'account',
|
||||
'download', 'shop', 'store', 'buy', 'cart', 'checkout',
|
||||
'ads', 'advertisement', 'sponsor', 'tracking',
|
||||
'facebook.com', 'twitter.com', 'instagram.com', 'pinterest.com',
|
||||
'youtube.com/channel', 'youtube.com/user', 'youtube.com/c/',
|
||||
'tiktok.com', 'douyin.com', 'weibo.com', 'zhihu.com/question',
|
||||
'baike.baidu.com', 'wiki.com', 'wikipedia.org/wiki/Help:',
|
||||
'wikipedia.org/wiki/Wikipedia:', 'wikipedia.org/wiki/Template:',
|
||||
'wikipedia.org/wiki/File:', 'wikipedia.org/wiki/Category:',
|
||||
'amazon.com/s', 'amazon.cn/s', 'taobao.com/search', 'jd.com/search',
|
||||
'tmall.com/search', 'ebay.com/sch', 'aliexpress.com/wholesale'
|
||||
],
|
||||
// 优先的域名(相关性更高)
|
||||
priorityDomains: [
|
||||
'github.com/augment', 'augmentcode.com', 'augment.dev',
|
||||
'github.com', 'stackoverflow.com', 'dev.to', 'medium.com',
|
||||
'docs.github.com', 'npmjs.com', 'pypi.org', 'microsoft.com/en-us/learn',
|
||||
'developer.mozilla.org', 'w3schools.com', 'reactjs.org', 'vuejs.org',
|
||||
'angular.io', 'tensorflow.org', 'pytorch.org', 'kubernetes.io',
|
||||
'docker.com', 'aws.amazon.com/documentation', 'cloud.google.com/docs',
|
||||
'azure.microsoft.com/en-us/documentation'
|
||||
],
|
||||
// 排除的文件类型
|
||||
excludedFileTypes: [
|
||||
'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp',
|
||||
'.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.wav', '.ogg',
|
||||
'.zip', '.rar', '.7z', '.tar', '.gz', '.exe', '.dmg', '.apk',
|
||||
'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'
|
||||
]
|
||||
}
|
||||
|
||||
// 分析模型配置
|
||||
private analyzeConfig = {
|
||||
enabled: true, // 是否启用预分析
|
||||
@ -198,56 +233,246 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
const batchSize = this.analyzeConfig.batchSize
|
||||
const analyzedResults: AnalyzedResult[] = [...results] // 复制原始结果
|
||||
|
||||
// 简单的分析逻辑:提取前几句作为摘要
|
||||
// 预处理查询,提取重要关键词
|
||||
const queryWords = query.toLowerCase().split(/\s+/)
|
||||
|
||||
// 过滤掉常见的停用词
|
||||
const stopWords = new Set([
|
||||
'a', 'an', 'the', 'and', 'or', 'but', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
|
||||
'in', 'on', 'at', 'to', 'for', 'with', 'by', 'about', 'against', 'between', 'into', 'through',
|
||||
'during', 'before', 'after', 'above', 'below', 'from', 'up', 'down', 'of', 'off', 'over', 'under',
|
||||
'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
|
||||
'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
|
||||
'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'should', 'now'
|
||||
])
|
||||
|
||||
// 提取重要关键词,并为每个词分配权重
|
||||
const keywordWeights = new Map<string, number>()
|
||||
queryWords.forEach((word, index) => {
|
||||
if (word.length > 2 && !stopWords.has(word)) {
|
||||
// 根据词的位置分配权重,前面的词权重更高
|
||||
const positionWeight = 1 - (index / queryWords.length) * 0.5 // 位置权重范围:0.5-1.0
|
||||
// 根据词的长度分配权重,更长的词权重更高
|
||||
const lengthWeight = Math.min(1, word.length / 10) // 长度权重最高为1
|
||||
// 组合权重
|
||||
const weight = positionWeight * 0.7 + lengthWeight * 0.3
|
||||
keywordWeights.set(word, weight)
|
||||
}
|
||||
})
|
||||
|
||||
// 如果没有提取到关键词,使用原始查询词
|
||||
if (keywordWeights.size === 0) {
|
||||
queryWords.forEach(word => {
|
||||
if (word.length > 2) {
|
||||
keywordWeights.set(word, 1.0)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// 分析每个结果
|
||||
for (let i = 0; i < results.length; i++) {
|
||||
const result = results[i]
|
||||
if (result.content === noContent) continue
|
||||
|
||||
try {
|
||||
// 提取摘要(简单实现,取前300个字符)
|
||||
// 提取摘要(改进实现,尝试找到包含关键词的最相关段落)
|
||||
const maxLength = this.analyzeConfig.maxSummaryLength
|
||||
let summary = result.content.replace(/\n+/g, ' ').replace(/\s+/g, ' ').trim()
|
||||
const content = result.content.replace(/\n+/g, ' ').replace(/\s+/g, ' ').trim()
|
||||
|
||||
if (summary.length > maxLength) {
|
||||
// 截取到最后一个完整的句子
|
||||
summary = summary.substring(0, maxLength)
|
||||
// 将内容分成句子
|
||||
const sentences = content.split(/(?<=[.!?])\s+/)
|
||||
|
||||
// 为每个句子评分
|
||||
const sentenceScores = sentences.map(sentence => {
|
||||
const sentenceLower = sentence.toLowerCase()
|
||||
let score = 0
|
||||
|
||||
// 根据句子中包含的关键词计算分数
|
||||
for (const [keyword, weight] of keywordWeights.entries()) {
|
||||
if (sentenceLower.includes(keyword)) {
|
||||
score += weight
|
||||
|
||||
// 如果关键词在句子开头,给予额外加分
|
||||
if (sentenceLower.indexOf(keyword) < 20) {
|
||||
score += weight * 0.5
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return { sentence, score }
|
||||
})
|
||||
|
||||
// 按分数排序句子
|
||||
sentenceScores.sort((a, b) => b.score - a.score)
|
||||
|
||||
// 选择最相关的句子作为摘要
|
||||
let summary = ''
|
||||
let currentLength = 0
|
||||
|
||||
// 首先添加得分最高的句子
|
||||
if (sentenceScores.length > 0 && sentenceScores[0].score > 0) {
|
||||
summary = sentenceScores[0].sentence
|
||||
currentLength = summary.length
|
||||
}
|
||||
|
||||
// 如果还有空间,添加更多相关句子
|
||||
for (let j = 1; j < sentenceScores.length && currentLength < maxLength; j++) {
|
||||
if (sentenceScores[j].score > 0) {
|
||||
const nextSentence = sentenceScores[j].sentence
|
||||
if (currentLength + nextSentence.length + 1 <= maxLength) {
|
||||
summary += ' ' + nextSentence
|
||||
currentLength += nextSentence.length + 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 如果没有找到相关句子,回退到简单摘要提取
|
||||
if (summary.length === 0) {
|
||||
summary = content.substring(0, maxLength)
|
||||
const lastPeriod = summary.lastIndexOf('.')
|
||||
if (lastPeriod > maxLength * 0.7) {
|
||||
// 至少要有总长度的70%
|
||||
summary = summary.substring(0, lastPeriod + 1)
|
||||
}
|
||||
summary += '...'
|
||||
} else if (summary.length < content.length) {
|
||||
summary += '...'
|
||||
}
|
||||
|
||||
// 提取关键词(简单实现,基于查询词拆分)
|
||||
const keywords = query
|
||||
.split(/\s+/)
|
||||
.filter((word) => word.length > 2 && result.content.toLowerCase().includes(word.toLowerCase()))
|
||||
// 提取关键词(改进实现)
|
||||
const contentLower = result.content.toLowerCase()
|
||||
const keywordScores = new Map<string, number>()
|
||||
|
||||
// 计算相关性评分(简单实现,基于关键词出现频率)
|
||||
let relevanceScore = 0
|
||||
if (keywords.length > 0) {
|
||||
const contentLower = result.content.toLowerCase()
|
||||
for (const word of keywords) {
|
||||
const wordLower = word.toLowerCase()
|
||||
// 计算关键词出现的次数
|
||||
let count = 0
|
||||
let pos = contentLower.indexOf(wordLower)
|
||||
while (pos !== -1) {
|
||||
count++
|
||||
pos = contentLower.indexOf(wordLower, pos + 1)
|
||||
}
|
||||
relevanceScore += count
|
||||
// 从内容中提取潜在关键词
|
||||
const contentWords = contentLower.split(/\W+/).filter(word =>
|
||||
word.length > 3 && !stopWords.has(word)
|
||||
)
|
||||
|
||||
// 计算词频
|
||||
const wordFrequency = new Map<string, number>()
|
||||
contentWords.forEach(word => {
|
||||
wordFrequency.set(word, (wordFrequency.get(word) || 0) + 1)
|
||||
})
|
||||
|
||||
// 为每个词评分
|
||||
for (const [word, freq] of wordFrequency.entries()) {
|
||||
// 基础分数是词频
|
||||
let score = freq
|
||||
|
||||
// 如果是查询关键词,增加分数
|
||||
if (keywordWeights.has(word)) {
|
||||
score += freq * keywordWeights.get(word)! * 3
|
||||
}
|
||||
// 标准化评分,范围为0-1
|
||||
relevanceScore = Math.min(1, relevanceScore / (contentLower.length / 100))
|
||||
|
||||
// 如果在标题中出现,增加分数
|
||||
if (result.title.toLowerCase().includes(word)) {
|
||||
score += 5
|
||||
}
|
||||
|
||||
keywordScores.set(word, score)
|
||||
}
|
||||
|
||||
// 添加查询关键词(如果内容中包含)
|
||||
for (const [keyword, weight] of keywordWeights.entries()) {
|
||||
if (contentLower.includes(keyword) && !keywordScores.has(keyword)) {
|
||||
keywordScores.set(keyword, weight * 3)
|
||||
}
|
||||
}
|
||||
|
||||
// 选择得分最高的关键词
|
||||
const sortedKeywords = Array.from(keywordScores.entries())
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.slice(0, 5)
|
||||
.map(entry => entry[0])
|
||||
|
||||
// 计算相关性评分(改进实现)
|
||||
let relevanceScore = 0
|
||||
|
||||
// 1. 基于关键词匹配度的评分
|
||||
let keywordMatchScore = 0
|
||||
let keywordCount = 0
|
||||
|
||||
for (const [keyword, weight] of keywordWeights.entries()) {
|
||||
// 计算关键词出现的次数
|
||||
let count = 0
|
||||
let pos = contentLower.indexOf(keyword)
|
||||
while (pos !== -1) {
|
||||
count++
|
||||
pos = contentLower.indexOf(keyword, pos + 1)
|
||||
}
|
||||
|
||||
if (count > 0) {
|
||||
keywordCount++
|
||||
// 权重 * 出现次数 * 归一化因子
|
||||
keywordMatchScore += weight * Math.min(10, count) / 10
|
||||
}
|
||||
}
|
||||
|
||||
// 归一化关键词匹配分数
|
||||
if (keywordWeights.size > 0) {
|
||||
keywordMatchScore = keywordMatchScore / keywordWeights.size
|
||||
}
|
||||
|
||||
// 2. 基于标题相关性的评分
|
||||
let titleScore = 0
|
||||
const titleLower = result.title.toLowerCase()
|
||||
|
||||
for (const [keyword, weight] of keywordWeights.entries()) {
|
||||
if (titleLower.includes(keyword)) {
|
||||
titleScore += weight
|
||||
}
|
||||
}
|
||||
|
||||
// 归一化标题分数
|
||||
if (keywordWeights.size > 0) {
|
||||
titleScore = titleScore / keywordWeights.size
|
||||
}
|
||||
|
||||
// 3. 基于内容长度的评分(适中长度最佳)
|
||||
const contentLength = result.content.length
|
||||
const lengthScore = Math.min(1, contentLength / 2000) * (contentLength < 50000 ? 1 : 0.5)
|
||||
|
||||
// 4. 基于URL的评分(官方网站、知名网站加分)
|
||||
let urlScore = 0
|
||||
const url = result.url.toLowerCase()
|
||||
|
||||
// 首先检查是否有预先计算的优先级分数
|
||||
if (result.meta && result.meta.priorityScore) {
|
||||
// 使用预先计算的分数
|
||||
urlScore = result.meta.priorityScore
|
||||
} else {
|
||||
// 如果没有预先计算的分数,使用基于域名的评分
|
||||
// 检查是否是官方网站或知名网站
|
||||
if (url.includes('github.com/augment') ||
|
||||
url.includes('augmentcode.com') ||
|
||||
url.includes('augment.dev')) {
|
||||
urlScore = 1.0 // 官方网站最高分
|
||||
} else if (url.includes('github.com') ||
|
||||
url.includes('stackoverflow.com') ||
|
||||
url.includes('medium.com') ||
|
||||
url.includes('dev.to')) {
|
||||
urlScore = 0.8 // 知名技术网站高分
|
||||
} else if (!url.includes('login') &&
|
||||
!url.includes('signup') &&
|
||||
!url.includes('register')) {
|
||||
urlScore = 0.5 // 普通网站中等分
|
||||
}
|
||||
}
|
||||
|
||||
// 组合所有评分因素,调整权重以提高URL质量的重要性
|
||||
relevanceScore =
|
||||
keywordMatchScore * 0.4 + // 关键词匹配度占40%
|
||||
titleScore * 0.3 + // 标题相关性占30%
|
||||
lengthScore * 0.05 + // 内容长度占5%
|
||||
urlScore * 0.25 // URL质量占25%,增加了权重
|
||||
|
||||
// 确保分数在0-1范围内
|
||||
relevanceScore = Math.min(1, Math.max(0, relevanceScore))
|
||||
|
||||
// 更新分析结果
|
||||
analyzedResults[i] = {
|
||||
...analyzedResults[i],
|
||||
summary,
|
||||
keywords,
|
||||
keywords: sortedKeywords,
|
||||
relevanceScore
|
||||
}
|
||||
|
||||
@ -267,16 +492,22 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
return scoreB - scoreA
|
||||
})
|
||||
|
||||
console.log(`[DeepSearch] 完成分析 ${results.length} 个结果`)
|
||||
return analyzedResults
|
||||
// 过滤掉明显不相关的结果,提高阈值以只保留更相关的结果
|
||||
const filteredResults = analyzedResults.filter(result => {
|
||||
const score = (result as AnalyzedResult).relevanceScore || 0
|
||||
return score > 0.2 // 提高阈值到 0.2,只保留相关性分数较高的结果
|
||||
})
|
||||
|
||||
console.log(`[DeepSearch] 完成分析 ${results.length} 个结果,过滤后剩余 ${filteredResults.length} 个结果`)
|
||||
return filteredResults
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析搜索结果页面中的URL
|
||||
* 默认实现,子类可以覆盖此方法以适应不同的搜索引擎
|
||||
*/
|
||||
protected parseValidUrls(htmlContent: string): Array<{ title: string; url: string }> {
|
||||
const results: Array<{ title: string; url: string }> = []
|
||||
protected parseValidUrls(htmlContent: string): Array<{ title: string; url: string; meta?: Record<string, any> }> {
|
||||
const results: Array<{ title: string; url: string; meta?: Record<string, any> }> = []
|
||||
|
||||
try {
|
||||
// 通用解析逻辑,查找所有链接
|
||||
@ -288,7 +519,11 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
...doc.querySelectorAll('#content_left .result h3 a'),
|
||||
...doc.querySelectorAll('#content_left .c-container h3 a'),
|
||||
...doc.querySelectorAll('#content_left .c-container a.c-title'),
|
||||
...doc.querySelectorAll('#content_left a[data-click]')
|
||||
...doc.querySelectorAll('#content_left a[data-click]'),
|
||||
// 添加更多选择器来适应百度的变化
|
||||
...doc.querySelectorAll('#content_left .result-op a[href*="http"]'),
|
||||
...doc.querySelectorAll('#content_left .c-container .t a'),
|
||||
...doc.querySelectorAll('#content_left .c-container .c-title-text')
|
||||
]
|
||||
|
||||
// 尝试解析Bing搜索结果 - 使用多个选择器来获取更多结果
|
||||
@ -296,7 +531,11 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
...doc.querySelectorAll('.b_algo h2 a'),
|
||||
...doc.querySelectorAll('.b_algo a.tilk'),
|
||||
...doc.querySelectorAll('.b_algo a.b_title'),
|
||||
...doc.querySelectorAll('.b_results a.b_restorLink')
|
||||
...doc.querySelectorAll('.b_results a.b_restorLink'),
|
||||
// 添加更多选择器来适应Bing的变化
|
||||
...doc.querySelectorAll('.b_algo .b_caption a'),
|
||||
...doc.querySelectorAll('.b_algo .b_attribution cite'),
|
||||
...doc.querySelectorAll('.b_algo .b_deep a')
|
||||
]
|
||||
|
||||
// 尝试解析DuckDuckGo搜索结果 - 使用多个选择器来获取更多结果
|
||||
@ -368,19 +607,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
const url = (link as HTMLAnchorElement).href
|
||||
const title = link.textContent || url
|
||||
|
||||
// 过滤掉搜索引擎内部链接和重复链接
|
||||
if (
|
||||
url &&
|
||||
(url.startsWith('http') || url.startsWith('https')) &&
|
||||
!url.includes('google.com/search') &&
|
||||
!url.includes('bing.com/search') &&
|
||||
!url.includes('baidu.com/s?') &&
|
||||
!uniqueUrls.has(url)
|
||||
) {
|
||||
// 使用过滤方法检查URL
|
||||
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
|
||||
uniqueUrls.add(url)
|
||||
results.push({
|
||||
title: title.trim() || url,
|
||||
url: url
|
||||
url: url,
|
||||
// 将优先级分数作为元数据保存
|
||||
meta: { priorityScore }
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
@ -399,19 +636,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
const url = (link as HTMLAnchorElement).href
|
||||
const title = link.textContent || url
|
||||
|
||||
// 过滤掉搜索引擎内部链接和重复链接
|
||||
if (
|
||||
url &&
|
||||
(url.startsWith('http') || url.startsWith('https')) &&
|
||||
!url.includes('google.com/search') &&
|
||||
!url.includes('bing.com/search') &&
|
||||
!url.includes('baidu.com/s?') &&
|
||||
!uniqueUrls.has(url)
|
||||
) {
|
||||
// 使用过滤方法检查URL
|
||||
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
|
||||
uniqueUrls.add(url)
|
||||
results.push({
|
||||
title: title.trim() || url,
|
||||
url: url
|
||||
url: url,
|
||||
// 将优先级分数作为元数据保存
|
||||
meta: { priorityScore }
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
@ -430,21 +665,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
const url = (link as HTMLAnchorElement).href
|
||||
const title = link.textContent || url
|
||||
|
||||
// 过滤掉搜索引擎内部链接和重复链接
|
||||
if (
|
||||
url &&
|
||||
(url.startsWith('http') || url.startsWith('https')) &&
|
||||
!url.includes('google.com/search') &&
|
||||
!url.includes('bing.com/search') &&
|
||||
!url.includes('baidu.com/s?') &&
|
||||
!url.includes('sogou.com/web') &&
|
||||
!url.includes('duckduckgo.com/?q=') &&
|
||||
!uniqueUrls.has(url)
|
||||
) {
|
||||
// 使用过滤方法检查URL
|
||||
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
|
||||
uniqueUrls.add(url)
|
||||
results.push({
|
||||
title: title.trim() || url,
|
||||
url: url
|
||||
url: url,
|
||||
// 将优先级分数作为元数据保存
|
||||
meta: { priorityScore }
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
@ -465,21 +696,25 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
const url = (link as HTMLAnchorElement).href
|
||||
const title = link.textContent || url
|
||||
|
||||
// 更宽松的过滤条件
|
||||
// 使用过滤方法检查URL,但使用更宽松的条件
|
||||
if (
|
||||
url &&
|
||||
(url.startsWith('http') || url.startsWith('https')) &&
|
||||
!url.includes('sogou.com/web') &&
|
||||
!url.includes('sogou.com/web') && // 仍然过滤掉搜索引擎内部链接
|
||||
!url.includes('javascript:') &&
|
||||
!url.includes('mailto:') &&
|
||||
!url.includes('tel:') &&
|
||||
!uniqueUrls.has(url) &&
|
||||
title.trim().length > 0
|
||||
) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
|
||||
uniqueUrls.add(url)
|
||||
results.push({
|
||||
title: title.trim() || url,
|
||||
url: url
|
||||
url: url,
|
||||
meta: { priorityScore }
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
@ -501,22 +736,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
const url = (link as HTMLAnchorElement).href
|
||||
const title = link.textContent || url
|
||||
|
||||
// 过滤掉搜索引擎内部链接和重复链接
|
||||
if (
|
||||
url &&
|
||||
(url.startsWith('http') || url.startsWith('https')) &&
|
||||
!url.includes('google.com/search') &&
|
||||
!url.includes('bing.com/search') &&
|
||||
!url.includes('baidu.com/s?') &&
|
||||
!url.includes('sogou.com/web') &&
|
||||
!url.includes('duckduckgo.com/?q=') &&
|
||||
!url.includes('searx.tiekoetter.com/search') &&
|
||||
!uniqueUrls.has(url)
|
||||
) {
|
||||
// 使用过滤方法检查URL
|
||||
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
|
||||
uniqueUrls.add(url)
|
||||
results.push({
|
||||
title: title.trim() || url,
|
||||
url: url
|
||||
url: url,
|
||||
// 将优先级分数作为元数据保存
|
||||
meta: { priorityScore }
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
@ -547,10 +777,14 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
!uniqueUrls.has(url) &&
|
||||
title.trim().length > 0
|
||||
) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
|
||||
uniqueUrls.add(url)
|
||||
results.push({
|
||||
title: title.trim() || url,
|
||||
url: url
|
||||
url: url,
|
||||
meta: { priorityScore }
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
@ -600,10 +834,14 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
!uniqueUrls.has(url) &&
|
||||
title.trim().length > 0
|
||||
) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
|
||||
uniqueUrls.add(url)
|
||||
results.push({
|
||||
title: title.trim() || url,
|
||||
url: url
|
||||
url: url,
|
||||
meta: { priorityScore }
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
@ -617,20 +855,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
const url = (link as HTMLAnchorElement).href
|
||||
const title = link.textContent || url
|
||||
|
||||
// 过滤掉搜索引擎内部链接和重复链接
|
||||
if (
|
||||
url &&
|
||||
(url.startsWith('http') || url.startsWith('https')) &&
|
||||
!url.includes('google.com/search') &&
|
||||
!url.includes('bing.com/search') &&
|
||||
!url.includes('baidu.com/s?') &&
|
||||
!url.includes('duckduckgo.com/?q=') &&
|
||||
!uniqueUrls.has(url)
|
||||
) {
|
||||
// 使用过滤方法检查URL
|
||||
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url)) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
|
||||
uniqueUrls.add(url)
|
||||
results.push({
|
||||
title: title.trim() || url,
|
||||
url: url
|
||||
url: url,
|
||||
// 将优先级分数作为元数据保存
|
||||
meta: { priorityScore }
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
@ -661,10 +896,14 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
!url.includes('searx.tiekoetter.com/search') &&
|
||||
!uniqueUrls.has(url)
|
||||
) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
|
||||
uniqueUrls.add(url)
|
||||
results.push({
|
||||
title: url,
|
||||
url: url
|
||||
url: url,
|
||||
meta: { priorityScore }
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -684,31 +923,17 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
const url = (link as HTMLAnchorElement).href
|
||||
const title = link.textContent || url
|
||||
|
||||
// 过滤掉无效链接和搜索引擎内部链接
|
||||
if (
|
||||
url &&
|
||||
(url.startsWith('http') || url.startsWith('https')) &&
|
||||
!url.includes('google.com/search') &&
|
||||
!url.includes('bing.com/search') &&
|
||||
!url.includes('baidu.com/s?') &&
|
||||
!url.includes('duckduckgo.com/?q=') &&
|
||||
!url.includes('sogou.com/web') &&
|
||||
!url.includes('searx.tiekoetter.com/search') &&
|
||||
!uniqueUrls.has(url) &&
|
||||
// 过滤掉常见的无用链接
|
||||
!url.includes('javascript:') &&
|
||||
!url.includes('mailto:') &&
|
||||
!url.includes('tel:') &&
|
||||
!url.includes('login') &&
|
||||
!url.includes('register') &&
|
||||
!url.includes('signup') &&
|
||||
!url.includes('signin') &&
|
||||
title.trim().length > 0
|
||||
) {
|
||||
// 使用过滤方法检查URL
|
||||
if (url && !this.shouldFilterUrl(url) && !uniqueUrls.has(url) && title.trim().length > 0) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
|
||||
uniqueUrls.add(url)
|
||||
results.push({
|
||||
title: title.trim(),
|
||||
url: url
|
||||
url: url,
|
||||
// 将优先级分数作为元数据保存
|
||||
meta: { priorityScore }
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
@ -725,6 +950,81 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
return results
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查URL是否应该被过滤掉
|
||||
* @param url 要检查的URL
|
||||
* @returns 如果应该过滤掉返回true,否则返回false
|
||||
*/
|
||||
private shouldFilterUrl(url: string): boolean {
|
||||
if (!url || !url.startsWith('http')) {
|
||||
return true
|
||||
}
|
||||
|
||||
const urlLower = url.toLowerCase()
|
||||
|
||||
// 检查是否是搜索引擎内部链接
|
||||
if (
|
||||
urlLower.includes('google.com/search') ||
|
||||
urlLower.includes('bing.com/search') ||
|
||||
urlLower.includes('baidu.com/s?') ||
|
||||
urlLower.includes('sogou.com/web') ||
|
||||
urlLower.includes('duckduckgo.com/?q=') ||
|
||||
urlLower.includes('searx.tiekoetter.com/search')
|
||||
) {
|
||||
return true
|
||||
}
|
||||
|
||||
// 检查是否包含排除的域名
|
||||
for (const domain of this.urlFilters.excludedDomains) {
|
||||
if (urlLower.includes(domain)) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// 检查是否是排除的文件类型
|
||||
for (const fileType of this.urlFilters.excludedFileTypes) {
|
||||
if (urlLower.endsWith(fileType)) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// 检查是否是常见的无用链接
|
||||
if (
|
||||
urlLower.includes('javascript:') ||
|
||||
urlLower.includes('mailto:') ||
|
||||
urlLower.includes('tel:') ||
|
||||
urlLower.includes('about:') ||
|
||||
urlLower.includes('chrome:') ||
|
||||
urlLower.includes('file:')
|
||||
) {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算URL的优先级分数
|
||||
* @param url 要计算的URL
|
||||
* @returns 优先级分数,范围从0到1,越高越优先
|
||||
*/
|
||||
private getUrlPriorityScore(url: string): number {
|
||||
if (!url) return 0
|
||||
|
||||
const urlLower = url.toLowerCase()
|
||||
|
||||
// 检查是否是优先域名
|
||||
for (let i = 0; i < this.urlFilters.priorityDomains.length; i++) {
|
||||
const domain = this.urlFilters.priorityDomains[i]
|
||||
if (urlLower.includes(domain)) {
|
||||
// 根据域名在数组中的位置计算分数,前面的域名分数更高
|
||||
return 1 - (i / this.urlFilters.priorityDomains.length) * 0.5
|
||||
}
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
/**
|
||||
* 深度抓取内容
|
||||
* 不仅抓取搜索结果页面,还会抓取页面中的链接
|
||||
@ -848,8 +1148,13 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
|
||||
while ((match = markdownLinkRegex.exec(markdown)) !== null) {
|
||||
const url = match[2]
|
||||
if (url && (url.startsWith('http') || url.startsWith('https'))) {
|
||||
urls.add(url)
|
||||
if (url && (url.startsWith('http') || url.startsWith('https')) && !this.shouldFilterUrl(url)) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
// 优先级较高的URL更有可能被添加
|
||||
if (priorityScore > 0.3 || Math.random() < 0.7) {
|
||||
urls.add(url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -857,8 +1162,13 @@ export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
const urlRegex = /(https?:\/\/[^\s]+)/g
|
||||
while ((match = urlRegex.exec(markdown)) !== null) {
|
||||
const url = match[1]
|
||||
if (url) {
|
||||
urls.add(url)
|
||||
if (url && !this.shouldFilterUrl(url)) {
|
||||
// 计算URL优先级分数
|
||||
const priorityScore = this.getUrlPriorityScore(url)
|
||||
// 优先级较高的URL更有可能被添加
|
||||
if (priorityScore > 0.3 || Math.random() < 0.5) {
|
||||
urls.add(url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -46,9 +46,9 @@ const initialState: WebSearchState = {
|
||||
{
|
||||
id: 'deep-search',
|
||||
name: 'DeepSearch',
|
||||
description: '多引擎深度搜索',
|
||||
usingBrowser: true,
|
||||
contentLimit: 10000
|
||||
contentLimit: 10000,
|
||||
description: '多引擎深度搜索'
|
||||
},
|
||||
{
|
||||
id: 'local-google',
|
||||
|
||||
@ -351,6 +351,7 @@ export type WebSearchProvider = {
|
||||
basicAuthPassword?: string
|
||||
contentLimit?: number
|
||||
usingBrowser?: boolean
|
||||
description?: string
|
||||
}
|
||||
|
||||
export type WebSearchResponse = {
|
||||
@ -363,6 +364,7 @@ export type WebSearchResult = {
|
||||
content: string
|
||||
url: string
|
||||
source?: string
|
||||
meta?: Record<string, any> // 添加meta字段用于存储元数据
|
||||
}
|
||||
|
||||
export type KnowledgeReference = {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user