mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2025-12-25 19:30:17 +08:00
deepsearch
This commit is contained in:
parent
1b2d15f2e8
commit
d33e16fa81
@ -6,7 +6,9 @@ import { HashRouter, Route, Routes } from 'react-router-dom'
|
||||
import { PersistGate } from 'redux-persist/integration/react'
|
||||
|
||||
import Sidebar from './components/app/Sidebar'
|
||||
import GeminiInitializer from './components/GeminiInitializer'
|
||||
import TopViewContainer from './components/TopView'
|
||||
import WebSearchInitializer from './components/WebSearchInitializer'
|
||||
import AntdProvider from './context/AntdProvider'
|
||||
import StyleSheetManager from './context/StyleSheetManager'
|
||||
import { SyntaxHighlighterProvider } from './context/SyntaxHighlighterProvider'
|
||||
@ -29,6 +31,8 @@ function App(): React.ReactElement {
|
||||
<AntdProvider>
|
||||
<SyntaxHighlighterProvider>
|
||||
<PersistGate loading={null} persistor={persistor}>
|
||||
<GeminiInitializer />
|
||||
<WebSearchInitializer />
|
||||
<TopViewContainer>
|
||||
<HashRouter>
|
||||
<NavigationHandler />
|
||||
|
||||
35
src/renderer/src/components/GeminiInitializer.tsx
Normal file
35
src/renderer/src/components/GeminiInitializer.tsx
Normal file
@ -0,0 +1,35 @@
|
||||
import { useEffect } from 'react'
|
||||
import { useDispatch, useSelector } from 'react-redux'
|
||||
import { RootState } from '@renderer/store'
|
||||
import { updateProvider } from '@renderer/store/llm'
|
||||
|
||||
/**
 * GeminiInitializer
 *
 * Invisible component that watches the LLM provider list in the store and
 * switches the Gemini provider off whenever it is enabled without an API key.
 */
const GeminiInitializer = () => {
  const dispatch = useDispatch()
  const providers = useSelector((state: RootState) => state.llm.providers)

  useEffect(() => {
    const gemini = providers.find(({ id }) => id === 'gemini')

    // Nothing to do unless Gemini exists, is enabled, and has no API key.
    if (!gemini || !gemini.enabled || gemini.apiKey) {
      return
    }

    dispatch(updateProvider({ ...gemini, enabled: false }))
    console.log('Gemini API disabled due to missing API key')
  }, [dispatch, providers])

  // Initializer only: there is no UI to render.
  return null
}
|
||||
|
||||
export default GeminiInitializer
|
||||
37
src/renderer/src/components/WebSearchInitializer.tsx
Normal file
37
src/renderer/src/components/WebSearchInitializer.tsx
Normal file
@ -0,0 +1,37 @@
|
||||
import { useEffect } from 'react'
|
||||
import { useDispatch, useSelector } from 'react-redux'
|
||||
import { RootState } from '@renderer/store'
|
||||
import { addWebSearchProvider } from '@renderer/store/websearch'
|
||||
import { WebSearchProvider } from '@renderer/types'
|
||||
|
||||
/**
 * WebSearchInitializer
 *
 * Invisible component that makes sure a provider with id `deep-search` is
 * present in the web-search slice of the store, adding it when it is missing.
 */
const WebSearchInitializer = () => {
  const dispatch = useDispatch()
  const providers = useSelector((state: RootState) => state.websearch.providers)

  useEffect(() => {
    const alreadyRegistered = providers.some(({ id }) => id === 'deep-search')
    if (alreadyRegistered) {
      return
    }

    // Not registered yet: add the DeepSearch provider entry.
    const deepSearch: WebSearchProvider = {
      id: 'deep-search',
      name: 'DeepSearch',
      description: '多引擎深度搜索',
      usingBrowser: true,
      contentLimit: 10000
    }
    dispatch(addWebSearchProvider(deepSearch))
  }, [dispatch, providers])

  // Initializer only: there is no UI to render.
  return null
}
|
||||
|
||||
export default WebSearchInitializer
|
||||
@ -0,0 +1,867 @@
|
||||
import { nanoid } from '@reduxjs/toolkit'
|
||||
import { WebSearchState } from '@renderer/store/websearch'
|
||||
import { WebSearchProvider, WebSearchResponse, WebSearchResult } from '@renderer/types'
|
||||
import { fetchWebContent, noContent } from '@renderer/utils/fetch'
|
||||
|
||||
/** A web search result enriched with the output of the local pre-analysis pass. */
interface AnalyzedResult extends WebSearchResult {
  /** Short excerpt of the page content. */
  summary?: string
  /** Keywords associated with the result. */
  keywords?: string[]
  /** Relevance score; results with a higher score are ranked first. */
  relevanceScore?: number
}
|
||||
|
||||
import BaseWebSearchProvider from './BaseWebSearchProvider'
|
||||
|
||||
export default class DeepSearchProvider extends BaseWebSearchProvider {
|
||||
// Built-in search engines. `%s` in each URL is replaced with the URL-encoded query.
private searchEngines = [
  { name: 'Baidu', url: 'https://www.baidu.com/s?wd=%s' },
  { name: 'Bing', url: 'https://cn.bing.com/search?q=%s&ensearch=1' },
  { name: 'DuckDuckGo', url: 'https://duckduckgo.com/?q=%s&t=h_' },
  { name: 'Sogou', url: 'https://www.sogou.com/web?query=%s' },
  {
    name: 'SearX',
    url: 'https://searx.tiekoetter.com/search?q=%s&categories=general&language=auto&time_range=&safesearch=0&theme=simple'
  }
]

// Settings for the local pre-analysis pass over fetched results.
private analyzeConfig = {
  // Whether the pre-analysis pass runs at all.
  enabled: true,
  // Maximum length of each result's summary, in characters.
  maxSummaryLength: 300,
  // Number of results per analysis batch.
  batchSize: 3
}
|
||||
|
||||
constructor(provider: WebSearchProvider) {
  // `provider.url` is not required here: the built-in engine list is always available.
  super(provider)
}
|
||||
|
||||
/**
 * Runs the query against every built-in engine (plus the optional custom
 * `provider.url`) in parallel, fetches the content of each unique hit,
 * optionally pre-analyses it, and returns the results ordered by relevance.
 *
 * @param query Raw query. When it contains CRLF-separated lines, the second line is used as the search text.
 * @param websearch Current web-search state, forwarded to the content fetcher.
 * @returns The original query plus every result whose content could be fetched, highest relevance first.
 * @throws Error prefixed with `DeepSearch failed:` when the query is blank or the pipeline fails.
 */
public async search(query: string, websearch: WebSearchState): Promise<WebSearchResponse> {
  try {
    if (!query.trim()) {
      throw new Error('Search query cannot be empty')
    }

    const cleanedQuery = query.split('\r\n')[1] ?? query
    const encodedQuery = encodeURIComponent(cleanedQuery)
    console.log(`[DeepSearch] 开始多引擎并行搜索: ${cleanedQuery}`)

    // One task per built-in engine. A failing engine resolves to [] so it cannot sink the whole search.
    const searchPromises = this.searchEngines.map(async (engine) => {
      try {
        const uid = `deep-search-${engine.name.toLowerCase()}-${nanoid()}`
        const url = engine.url.replace('%s', encodedQuery)
        console.log(`[DeepSearch] 使用${engine.name}搜索: ${url}`)

        // Load the engine's result page in the search window and pull the result links out of its HTML.
        const content = await window.api.searchService.openUrlInSearchWindow(uid, url)
        const searchItems = this.parseValidUrls(content)
        console.log(`[DeepSearch] ${engine.name}找到 ${searchItems.length} 个结果`)

        return searchItems.map((item) => ({ ...item, source: engine.name }))
      } catch (engineError) {
        console.error(`[DeepSearch] ${engine.name}搜索失败:`, engineError)
        return []
      }
    })

    // Optional user-supplied engine, searched alongside the built-in ones.
    const customUrlTemplate = this.provider.url
    if (customUrlTemplate) {
      searchPromises.push(
        (async () => {
          try {
            const uid = `deep-search-custom-${nanoid()}`
            const url = customUrlTemplate.replace('%s', encodedQuery)
            console.log(`[DeepSearch] 使用自定义搜索: ${url}`)

            const content = await window.api.searchService.openUrlInSearchWindow(uid, url)
            const searchItems = this.parseValidUrls(content)
            console.log(`[DeepSearch] 自定义搜索找到 ${searchItems.length} 个结果`)

            return searchItems.map((item) => ({ ...item, source: '自定义' }))
          } catch (customError) {
            console.error('[DeepSearch] 自定义搜索失败:', customError)
            return []
          }
        })()
      )
    }

    // Merge the per-engine hit lists.
    const allItems: Array<{ title: string; url: string; source: string }> = []
    for (const engineItems of await Promise.all(searchPromises)) {
      allItems.push(...engineItems)
    }
    console.log(`[DeepSearch] 总共找到 ${allItems.length} 个结果`)

    // De-duplicate by URL; the first engine to report a URL keeps it.
    const seenUrls = new Set<string>()
    const uniqueItems = allItems.filter((item) => {
      if (seenUrls.has(item.url)) {
        return false
      }
      seenUrls.add(item.url)
      return true
    })
    console.log(`[DeepSearch] 去重后有 ${uniqueItems.length} 个结果`)

    // Keep http(s) URLs only; the number of results is not capped.
    const validItems = uniqueItems.filter((item) => item.url.startsWith('http'))
    console.log(`[DeepSearch] 过滤后有 ${validItems.length} 个有效结果`)

    const fetched = await this.fetchContentsWithDepth(validItems, websearch)

    // Tag titles with the originating engine NOW, while `fetched` is still
    // index-aligned with `validItems` (first-level pages come first, in input
    // order). analyzeResults() may reorder its output, so doing this by index
    // afterwards would attach the wrong engine label to a result.
    const labelled = fetched.map((result, index) =>
      index < validItems.length ? { ...result, title: `[${validItems[index].source}] ${result.title}` } : result
    )

    const analyzed: AnalyzedResult[] = this.analyzeConfig.enabled
      ? await this.analyzeResults(labelled, cleanedQuery)
      : labelled

    // Prepend summary and keywords (when the analysis produced them) to the content.
    const enhanced = analyzed.map((result) => {
      const { summary, keywords } = result
      let content = result.content

      // Skip the summary when it is just the leading part of the content itself.
      if (summary && summary !== content.substring(0, summary.length)) {
        content = `**摘要**: ${summary}\n\n---\n\n${content}`
      }
      if (keywords && keywords.length > 0) {
        content = `**关键词**: ${keywords.join(', ')}\n\n${content}`
      }

      return content === result.content ? result : { ...result, content }
    })

    // Highest relevance first; results without a score count as 0.
    const sortedResults = [...enhanced].sort((a, b) => (b.relevanceScore ?? 0) - (a.relevanceScore ?? 0))

    return {
      query,
      // Drop pages whose content could not be fetched.
      results: sortedResults.filter((result) => result.content !== noContent)
    }
  } catch (error) {
    console.error('[DeepSearch] 搜索失败:', error)
    throw new Error(`DeepSearch failed: ${error instanceof Error ? error.message : 'Unknown error'}`)
  }
}
|
||||
|
||||
/**
 * Adds a summary, the matched query keywords and a relevance score to every
 * result that has content. Results without content are passed through as-is.
 *
 * The returned array has the same length and the same order as `results`, so
 * the caller can correlate entries by index; ranking by `relevanceScore` is
 * left to the caller.
 *
 * @param results Fetched search results.
 * @param query Search text; its whitespace-separated terms longer than two characters are the keyword candidates.
 * @returns One analysed entry per input result, in input order.
 */
private async analyzeResults(results: WebSearchResult[], query: string): Promise<AnalyzedResult[]> {
  console.log(`[DeepSearch] 开始分析 ${results.length} 个结果`)

  const { batchSize, maxSummaryLength } = this.analyzeConfig
  const queryTerms = query.split(/\s+/).filter((word) => word.length > 2)
  // Sentence terminators, ASCII and full-width (content is frequently Chinese).
  const sentenceEnds = ['.', '!', '?', '。', '！', '？']

  const analyzedResults = results.map((result, index): AnalyzedResult => {
    let analyzed: AnalyzedResult = result

    if (result.content !== noContent) {
      try {
        // Summary: whitespace-collapsed content, cut to the configured length.
        let summary = result.content.replace(/\s+/g, ' ').trim()
        if (summary.length > maxSummaryLength) {
          const clipped = summary.substring(0, maxSummaryLength)
          const lastSentenceEnd = Math.max(...sentenceEnds.map((mark) => clipped.lastIndexOf(mark)))
          // End on a sentence boundary only if that keeps at least 70% of the allowed length.
          summary =
            (lastSentenceEnd > maxSummaryLength * 0.7 ? clipped.substring(0, lastSentenceEnd + 1) : clipped) + '...'
        }

        // Keywords: query terms that occur in the content (case-insensitive).
        const contentLower = result.content.toLowerCase()
        const keywords = queryTerms.filter((word) => contentLower.includes(word.toLowerCase()))

        // Relevance: keyword occurrences per 100 characters of content, capped at 1.
        let relevanceScore = 0
        if (keywords.length > 0) {
          let occurrences = 0
          for (const word of keywords) {
            const needle = word.toLowerCase()
            for (let pos = contentLower.indexOf(needle); pos !== -1; pos = contentLower.indexOf(needle, pos + 1)) {
              occurrences++
            }
          }
          relevanceScore = Math.min(1, occurrences / (contentLower.length / 100))
        }

        analyzed = { ...result, summary, keywords, relevanceScore }
      } catch (error) {
        // Best effort: a result that cannot be analysed is returned unchanged.
        console.error(`[DeepSearch] 分析结果 ${index} 失败:`, error)
      }
    }

    // Report progress after each full batch and once at the very end.
    const processed = index + 1
    if (processed % batchSize === 0 || processed === results.length) {
      console.log(`[DeepSearch] 已分析 ${processed}/${results.length} 个结果`)
    }

    return analyzed
  })

  console.log(`[DeepSearch] 完成分析 ${results.length} 个结果`)
  return analyzedResults
}
|
||||
|
||||
/** CSS selectors used to recognise result links on each supported engine's result page. */
private static readonly RESULT_SELECTORS = {
  baidu: [
    '#content_left .result h3 a',
    '#content_left .c-container h3 a',
    '#content_left .c-container a.c-title',
    '#content_left a[data-click]'
  ],
  bing: ['.b_algo h2 a', '.b_algo a.tilk', '.b_algo a.b_title', '.b_results a.b_restorLink'],
  // DuckDuckGo's DOM changes often, hence the deliberately broad list.
  duckduckgo: [
    '.result__a',
    '.result__url',
    '.result__snippet a',
    '.results_links_deep a',
    'a.result__check',
    'a.js-result-title-link',
    'article a',
    '.nrn-react-div a',
    'a[href*="http"]',
    'a[data-testid]',
    '.module a'
  ],
  sogou: [
    '.vrwrap h3 a',
    '.vr-title a',
    '.citeurl a',
    '.fz-mid a',
    '.vrTitle a',
    '.fb a',
    '.results a',
    '.rb a',
    '.vr_list a',
    '.vrResult a',
    '.vr_tit_a',
    '.vr_title a'
  ],
  searx: [
    '.result h4 a',
    '.result-content a',
    '.result-url',
    '.result-header a',
    '.result-link',
    '.result a',
    '.results a',
    'article a',
    '.url_wrapper a',
    '.external-link'
  ]
}

/**
 * Appends every acceptable link from `links` to `results`.
 *
 * A link is accepted when its `href` is a string starting with `http`, has
 * not been seen before, contains none of `excludedFragments`, and (when
 * `requireTitle` is set) has non-blank text. The title falls back to the URL.
 */
private collectResultLinks(
  links: Iterable<Element>,
  excludedFragments: readonly string[],
  requireTitle: boolean,
  seen: Set<string>,
  results: Array<{ title: string; url: string }>
): void {
  for (const link of links) {
    const href: unknown = (link as HTMLAnchorElement).href
    // Some selectors match non-anchor elements (no href), and an SVG <a>
    // exposes an object rather than a string; skip those explicitly.
    if (typeof href !== 'string' || !href.startsWith('http')) continue
    if (seen.has(href) || excludedFragments.some((fragment) => href.includes(fragment))) continue

    const title = (link.textContent || href).trim()
    if (requireTitle && title.length === 0) continue

    seen.add(href)
    results.push({ title: title || href, url: href })
  }
}

/**
 * Extracts result links from the HTML of a search-engine result page.
 *
 * The engine is guessed from the page itself, in this order: Baidu, Bing,
 * Sogou, SearX, DuckDuckGo, then a generic "all links" fallback. Subclasses
 * may override this to support other engines.
 *
 * @param htmlContent Raw HTML of the result page.
 * @returns Unique http(s) links with their titles; empty when parsing fails.
 */
protected parseValidUrls(htmlContent: string): Array<{ title: string; url: string }> {
  const results: Array<{ title: string; url: string }> = []

  try {
    const doc = new DOMParser().parseFromString(htmlContent, 'text/html')
    const selectors = DeepSearchProvider.RESULT_SELECTORS

    // All elements matching any selector of a group, in selector order.
    const pick = (group: readonly string[]): Element[] => {
      const found: Element[] = []
      for (const selector of group) {
        found.push(...doc.querySelectorAll(selector))
      }
      return found
    }

    // Only one engine branch runs per call, so a single de-duplication set is enough.
    const seen = new Set<string>()
    const collect = (links: Iterable<Element>, excluded: readonly string[], requireTitle = false): void =>
      this.collectResultLinks(links, excluded, requireTitle, seen, results)

    // URL fragments of search-engine result pages, and pseudo-scheme fragments to skip.
    const searchPages = ['google.com/search', 'bing.com/search', 'baidu.com/s?']
    const pseudoSchemes = ['javascript:', 'mailto:', 'tel:']

    const baiduLinks = pick(selectors.baidu)
    const bingLinks = pick(selectors.bing)
    const duckduckgoLinks = pick(selectors.duckduckgo)
    const sogouLinks = pick(selectors.sogou)
    const searxLinks = pick(selectors.searx)

    if (baiduLinks.length > 0) {
      console.log('[DeepSearch] 检测到Baidu搜索结果页面')
      collect(baiduLinks, searchPages)
    } else if (bingLinks.length > 0) {
      console.log('[DeepSearch] 检测到Bing搜索结果页面')
      collect(bingLinks, searchPages)
    } else if (sogouLinks.length > 0 || htmlContent.includes('sogou.com')) {
      console.log('[DeepSearch] 检测到搜狗搜索结果页面')
      collect(sogouLinks, [...searchPages, 'sogou.com/web', 'duckduckgo.com/?q='])

      // Too few hits from the targeted selectors: fall back to every titled link on the page.
      if (results.length < 10) {
        console.log('[DeepSearch] 搜狗标准选择器找到的结果很少,尝试使用更通用的方法')
        collect(doc.querySelectorAll('a'), ['sogou.com/web', ...pseudoSchemes], true)
      }

      console.log(`[DeepSearch] 搜狗找到 ${results.length} 个结果`)
    } else if (searxLinks.length > 0 || htmlContent.includes('searx.tiekoetter.com')) {
      console.log('[DeepSearch] 检测到SearX搜索结果页面')
      collect(searxLinks, [...searchPages, 'sogou.com/web', 'duckduckgo.com/?q=', 'searx.tiekoetter.com/search'])

      // Too few hits from the targeted selectors: fall back to every titled link on the page.
      if (results.length < 10) {
        console.log('[DeepSearch] SearX标准选择器找到的结果很少,尝试使用更通用的方法')
        collect(doc.querySelectorAll('a'), ['searx.tiekoetter.com/search', ...pseudoSchemes], true)
      }

      console.log(`[DeepSearch] SearX找到 ${results.length} 个结果`)
    } else if (duckduckgoLinks.length > 0 || htmlContent.includes('duckduckgo.com')) {
      // NOTE(review): the DuckDuckGo selector list contains `a[href*="http"]`, so
      // any page with an absolute link that was not claimed above lands here and
      // the generic branch below is rarely reached; confirm this is intended.
      console.log('[DeepSearch] 检测到DuckDuckGo搜索结果页面')

      if (duckduckgoLinks.length < 10) {
        console.log('[DeepSearch] DuckDuckGo标准选择器找到的结果很少,尝试使用更通用的方法')
        collect(
          doc.querySelectorAll('a'),
          [
            'duckduckgo.com',
            ...searchPages,
            ...pseudoSchemes,
            'about:',
            'chrome:',
            'file:',
            'login',
            'signup',
            'account'
          ],
          true
        )
      } else {
        collect(duckduckgoLinks, [...searchPages, 'duckduckgo.com/?q='])
      }

      // Still too few: scrape every URL-looking string out of the raw HTML.
      if (results.length < 10 && htmlContent.includes('duckduckgo.com')) {
        console.log('[DeepSearch] DuckDuckGo结果仍然很少,尝试提取所有可能的URL')

        const excluded = ['duckduckgo.com', ...searchPages, 'sogou.com/web', 'searx.tiekoetter.com/search']
        const urlRegex = /https?:\/\/[^\s"'<>()]+/g
        let match: RegExpExecArray | null

        while ((match = urlRegex.exec(htmlContent)) !== null) {
          const url = match[0]
          if (!seen.has(url) && !excluded.some((fragment) => url.includes(fragment))) {
            seen.add(url)
            // No anchor text is available here, so the URL doubles as the title.
            results.push({ title: url, url })
          }
        }
      }

      console.log(`[DeepSearch] DuckDuckGo找到 ${results.length} 个结果`)
    } else {
      // Unrecognised page: take every titled link that is not an engine-internal or account link.
      console.log('[DeepSearch] 使用通用解析方法')
      collect(
        doc.querySelectorAll('a'),
        [
          ...searchPages,
          'duckduckgo.com/?q=',
          'sogou.com/web',
          'searx.tiekoetter.com/search',
          ...pseudoSchemes,
          'login',
          'register',
          'signup',
          'signin'
        ],
        true
      )
    }

    console.log(`[DeepSearch] 解析到 ${results.length} 个有效链接`)
  } catch (error) {
    console.error('[DeepSearch] 解析HTML失败:', error)
  }

  return results
}
|
||||
|
||||
/**
 * Fetches one page as markdown and applies the provider's content limit.
 * Never rejects: on failure the returned result carries `noContent`.
 *
 * @param item Page to fetch, with its fallback title and optional source label.
 * @param deep Whether this is a second-level page; such pages get a `[深度] ` title prefix.
 */
private async fetchSinglePage(
  item: { title: string; url: string; source?: string },
  deep: boolean
): Promise<WebSearchResult> {
  console.log(deep ? `[DeepSearch] 抓取第二层页面: ${item.url}` : `[DeepSearch] 抓取页面: ${item.url}`)
  const titlePrefix = deep ? '[深度] ' : ''

  try {
    const result = await fetchWebContent(item.url, 'markdown', this.provider.usingBrowser)

    // Truncate over-long content; a limit of -1 (or no limit) means unlimited.
    const limit = this.provider.contentLimit
    if (limit && limit !== -1 && result.content.length > limit) {
      result.content = result.content.slice(0, limit) + '...'
    }

    if (deep) {
      result.title = `${titlePrefix}${result.title}`
    }
    if (item.source) {
      result.source = item.source
    }
    return result
  } catch (error) {
    console.error(deep ? `[DeepSearch] 抓取第二层 ${item.url} 失败:` : `[DeepSearch] 抓取 ${item.url} 失败:`, error)
    return {
      title: `${titlePrefix}${item.title}`,
      content: noContent,
      url: item.url,
      source: item.source
    }
  }
}

/**
 * Fetches the content of every item in parallel and, when `depth` is greater
 * than 1, also follows links found inside the fetched pages (one extra level,
 * at most 30 links).
 *
 * @param items Pages to fetch.
 * @param _websearch Web-search state; currently unused.
 * @param depth 1 fetches only `items`; any larger value adds the single second level.
 * @returns First-level results in the same order as `items` (failed fetches
 *   carry `noContent`), followed by the second-level results that have content.
 */
private async fetchContentsWithDepth(
  items: Array<{ title: string; url: string; source?: string }>,
  _websearch: WebSearchState,
  depth: number = 1
): Promise<WebSearchResult[]> {
  console.log(`[DeepSearch] 开始并行深度抓取,深度: ${depth}`)

  // Level 1: the search hits themselves.
  const firstLevelResults = await Promise.all(items.map((item) => this.fetchSinglePage(item, false)))
  if (depth <= 1) {
    return firstLevelResults
  }

  // Level 2: links found in the level-1 content, skipping pages that were
  // already fetched at level 1 so the same page is not returned twice.
  const alreadyFetched = new Set(items.map((item) => item.url))
  const secondLevelUrls = new Set<string>()
  for (const result of firstLevelResults) {
    if (result.content === noContent) continue
    for (const url of this.extractUrlsFromMarkdown(result.content)) {
      if (!alreadyFetched.has(url)) {
        secondLevelUrls.add(url)
      }
    }
  }

  // Cap the second level at 30 pages.
  const secondLevelUrlsArray = Array.from(secondLevelUrls).slice(0, 30)
  console.log(`[DeepSearch] 第二层找到 ${secondLevelUrls.size} 个URL,将抓取 ${secondLevelUrlsArray.length} 个`)

  const secondLevelResults = await Promise.all(
    // The URL doubles as the fallback title; '深度链接' marks the result as a deep link.
    secondLevelUrlsArray.map((url) => this.fetchSinglePage({ title: url, url, source: '深度链接' }, true))
  )

  // Failed second-level fetches are dropped; failed first-level ones are kept for the caller to filter.
  return [...firstLevelResults, ...secondLevelResults.filter((result) => result.content !== noContent)]
}
|
||||
|
||||
/**
 * Extracts the unique http(s) URLs from markdown text, in order of first appearance.
 *
 * Both link targets (`[text](url)`) and bare URLs are found by a single scan.
 * A URL ends at whitespace, at `<`, `>`, `[`, `]` or a quote, or at a `)` that
 * does not close a `(` opened inside the URL. This way the closing parenthesis
 * of a markdown link is not swallowed, while URLs such as `.../Foo_(bar)` stay
 * intact. Trailing sentence punctuation and emphasis markers are stripped.
 *
 * @param markdown Markdown content to scan.
 * @returns Unique URLs found in the text.
 */
private extractUrlsFromMarkdown(markdown: string): string[] {
  const urls = new Set<string>()
  const schemeRegex = /https?:\/\//g
  const terminator = /[\s<>"'[\]]/
  const trailingNoise = /[.,;:!?*]+$/

  let match: RegExpExecArray | null
  while ((match = schemeRegex.exec(markdown)) !== null) {
    let end = match.index + match[0].length
    let openParens = 0

    // Walk forward until something that cannot be part of the URL.
    while (end < markdown.length) {
      const ch = markdown[end]
      if (terminator.test(ch)) break
      if (ch === '(') {
        openParens++
      } else if (ch === ')') {
        // An unmatched ')' belongs to the surrounding markdown, not to the URL.
        if (openParens === 0) break
        openParens--
      }
      end++
    }

    const url = markdown.slice(match.index, end).replace(trailingNoise, '')
    // Ignore a bare scheme with nothing after it.
    if (url.length > match[0].length) {
      urls.add(url)
    }

    // Resume after this URL so a URL embedded in its query string is not reported separately.
    schemeRegex.lastIndex = Math.max(end, schemeRegex.lastIndex)
  }

  return Array.from(urls)
}
|
||||
}
|
||||
@ -2,6 +2,7 @@ import { WebSearchProvider } from '@renderer/types'
|
||||
|
||||
import BaseWebSearchProvider from './BaseWebSearchProvider'
|
||||
import DefaultProvider from './DefaultProvider'
|
||||
import DeepSearchProvider from './DeepSearchProvider'
|
||||
import ExaProvider from './ExaProvider'
|
||||
import LocalBaiduProvider from './LocalBaiduProvider'
|
||||
import LocalBingProvider from './LocalBingProvider'
|
||||
@ -18,6 +19,8 @@ export default class WebSearchProviderFactory {
|
||||
return new SearxngProvider(provider)
|
||||
case 'exa':
|
||||
return new ExaProvider(provider)
|
||||
case 'deep-search':
|
||||
return new DeepSearchProvider(provider)
|
||||
case 'local-google':
|
||||
return new LocalGoogleProvider(provider)
|
||||
case 'local-baidu':
|
||||
|
||||
@ -45,6 +45,11 @@ class WebSearchService {
|
||||
return provider.apiHost !== ''
|
||||
}
|
||||
|
||||
// DeepSearch提供商不需要API密钥或主机
|
||||
if (provider.id === 'deep-search') {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
|
||||
@ -43,6 +43,13 @@ const initialState: WebSearchState = {
|
||||
name: 'Exa',
|
||||
apiKey: ''
|
||||
},
|
||||
{
|
||||
id: 'deep-search',
|
||||
name: 'DeepSearch',
|
||||
description: '多引擎深度搜索',
|
||||
usingBrowser: true,
|
||||
contentLimit: 10000
|
||||
},
|
||||
{
|
||||
id: 'local-google',
|
||||
name: 'Google',
|
||||
@ -60,7 +67,7 @@ const initialState: WebSearchState = {
|
||||
}
|
||||
],
|
||||
searchWithTime: true,
|
||||
maxResults: 5,
|
||||
maxResults: 10,
|
||||
excludeDomains: [],
|
||||
subscribeSources: [],
|
||||
overwrite: false
|
||||
|
||||
@ -362,6 +362,7 @@ export type WebSearchResult = {
|
||||
title: string
|
||||
content: string
|
||||
url: string
|
||||
source?: string
|
||||
}
|
||||
|
||||
export type KnowledgeReference = {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user