deepsearch

This commit is contained in:
1600822305 2025-04-24 01:04:27 +08:00
parent 1b2d15f2e8
commit d33e16fa81
8 changed files with 960 additions and 1 deletions

View File

@ -6,7 +6,9 @@ import { HashRouter, Route, Routes } from 'react-router-dom'
import { PersistGate } from 'redux-persist/integration/react'
import Sidebar from './components/app/Sidebar'
import GeminiInitializer from './components/GeminiInitializer'
import TopViewContainer from './components/TopView'
import WebSearchInitializer from './components/WebSearchInitializer'
import AntdProvider from './context/AntdProvider'
import StyleSheetManager from './context/StyleSheetManager'
import { SyntaxHighlighterProvider } from './context/SyntaxHighlighterProvider'
@ -29,6 +31,8 @@ function App(): React.ReactElement {
<AntdProvider>
<SyntaxHighlighterProvider>
<PersistGate loading={null} persistor={persistor}>
<GeminiInitializer />
<WebSearchInitializer />
<TopViewContainer>
<HashRouter>
<NavigationHandler />

View File

@ -0,0 +1,35 @@
import { useEffect } from 'react'
import { useDispatch, useSelector } from 'react-redux'
import { RootState } from '@renderer/store'
import { updateProvider } from '@renderer/store/llm'
/**
 * GeminiInitializer
 *
 * Render-less component that watches the LLM provider list in the store.
 * Whenever the `gemini` provider is enabled but has no API key, it dispatches
 * an update that disables the provider and logs the fact.
 */
const GeminiInitializer = () => {
  const providers = useSelector((state: RootState) => state.llm.providers)
  const dispatch = useDispatch()

  useEffect(() => {
    const gemini = providers.find(({ id }) => id === 'gemini')

    // Nothing to do unless Gemini exists, is enabled, and lacks an API key.
    if (!gemini || !gemini.enabled || gemini.apiKey) {
      return
    }

    dispatch(updateProvider({ ...gemini, enabled: false }))
    console.log('Gemini API disabled due to missing API key')
  }, [dispatch, providers])

  // Initialisation-only component: it never renders any UI.
  return null
}
export default GeminiInitializer

View File

@ -0,0 +1,37 @@
import { useEffect } from 'react'
import { useDispatch, useSelector } from 'react-redux'
import { RootState } from '@renderer/store'
import { addWebSearchProvider } from '@renderer/store/websearch'
import { WebSearchProvider } from '@renderer/types'
/**
 * WebSearchInitializer
 *
 * Render-less component that makes sure the `deep-search` web-search provider
 * is present in the store. If no provider with that id exists, a default
 * DeepSearch entry is added.
 */
const WebSearchInitializer = () => {
  const providers = useSelector((state: RootState) => state.websearch.providers)
  const dispatch = useDispatch()

  useEffect(() => {
    // Already registered: leave the store untouched.
    if (providers.some(({ id }) => id === 'deep-search')) {
      return
    }

    const deepSearchProvider: WebSearchProvider = {
      id: 'deep-search',
      name: 'DeepSearch',
      description: '多引擎深度搜索',
      usingBrowser: true,
      contentLimit: 10000
    }
    dispatch(addWebSearchProvider(deepSearchProvider))
  }, [dispatch, providers])

  // Initialisation-only component: it never renders any UI.
  return null
}
export default WebSearchInitializer

View File

@ -0,0 +1,867 @@
import { nanoid } from '@reduxjs/toolkit'
import { WebSearchState } from '@renderer/store/websearch'
import { WebSearchProvider, WebSearchResponse, WebSearchResult } from '@renderer/types'
import { fetchWebContent, noContent } from '@renderer/utils/fetch'
// A search result enriched with the fields produced by DeepSearchProvider's analysis pass.
interface AnalyzedResult extends WebSearchResult {
summary?: string // whitespace-collapsed excerpt of the content, truncated to the configured summary length
keywords?: string[] // query words (longer than two characters) that occur in the content
relevanceScore?: number // keyword-frequency score, capped at 1
}
import BaseWebSearchProvider from './BaseWebSearchProvider'
export default class DeepSearchProvider extends BaseWebSearchProvider {
// Built-in search engines queried on every search; `%s` in each URL is replaced with the URL-encoded query
private searchEngines = [
{ name: 'Baidu', url: 'https://www.baidu.com/s?wd=%s' },
{ name: 'Bing', url: 'https://cn.bing.com/search?q=%s&ensearch=1' },
{ name: 'DuckDuckGo', url: 'https://duckduckgo.com/?q=%s&t=h_' },
{ name: 'Sogou', url: 'https://www.sogou.com/web?query=%s' },
{
name: 'SearX',
url: 'https://searx.tiekoetter.com/search?q=%s&categories=general&language=auto&time_range=&safesearch=0&theme=simple'
}
]
// Settings for the analysis pass that runs after page contents are fetched
private analyzeConfig = {
enabled: true, // whether the analysis pass runs at all
maxSummaryLength: 300, // maximum length, in characters, of each result's summary
batchSize: 3 // analysis progress is logged every `batchSize` results
}
/**
 * @param provider Provider configuration; passed unchanged to the base class.
 */
constructor(provider: WebSearchProvider) {
super(provider)
// provider.url is no longer mandatory, because default search engines are built in
}
/**
 * Runs the query against every built-in search engine (plus the optional
 * custom engine configured in `provider.url`) in parallel, merges and
 * de-duplicates the links found, fetches each linked page, optionally
 * annotates the pages with summary/keywords/relevance, and returns them
 * ordered by descending relevance.
 *
 * Each result title is prefixed with the name of the engine that produced it.
 * The label is read from the result's own `source` field rather than from a
 * positional lookup, because the analysis pass reorders the results.
 *
 * @param query Raw query text. If it contains CRLF-separated lines, only the
 *   second line is used as the search terms.
 * @param websearch Current web-search state, forwarded to the content fetcher.
 * @returns The original query and every result whose content could be fetched.
 * @throws Error with a `DeepSearch failed:` prefix when the query is blank or
 *   any step outside the per-engine searches fails.
 */
public async search(query: string, websearch: WebSearchState): Promise<WebSearchResponse> {
  try {
    if (!query.trim()) {
      throw new Error('Search query cannot be empty')
    }
    const cleanedQuery = query.split('\r\n')[1] ?? query
    console.log(`[DeepSearch] 开始多引擎并行搜索: ${cleanedQuery}`)

    // Links collected from every engine.
    const allItems: Array<{ title: string; url: string; source: string }> = []

    // One promise per built-in engine; a failing engine contributes an empty list.
    const searchPromises = this.searchEngines.map(async (engine) => {
      try {
        const uid = `deep-search-${engine.name.toLowerCase()}-${nanoid()}`
        const url = engine.url.replace('%s', encodeURIComponent(cleanedQuery))
        console.log(`[DeepSearch] 使用${engine.name}搜索: ${url}`)
        // Load the engine's result page in the search window and read its HTML.
        const content = await window.api.searchService.openUrlInSearchWindow(uid, url)
        const searchItems = this.parseValidUrls(content)
        console.log(`[DeepSearch] ${engine.name}找到 ${searchItems.length} 个结果`)
        // Tag every link with the engine it came from.
        return searchItems.map((item) => ({
          ...item,
          source: engine.name
        }))
      } catch (engineError) {
        console.error(`[DeepSearch] ${engine.name}搜索失败:`, engineError)
        return []
      }
    })

    // A user-supplied engine URL is searched alongside the built-in ones.
    if (this.provider.url) {
      searchPromises.push(
        (async () => {
          try {
            const uid = `deep-search-custom-${nanoid()}`
            const url = this.provider.url ? this.provider.url.replace('%s', encodeURIComponent(cleanedQuery)) : ''
            console.log(`[DeepSearch] 使用自定义搜索: ${url}`)
            const content = await window.api.searchService.openUrlInSearchWindow(uid, url)
            const searchItems = this.parseValidUrls(content)
            console.log(`[DeepSearch] 自定义搜索找到 ${searchItems.length} 个结果`)
            return searchItems.map((item) => ({
              ...item,
              source: '自定义'
            }))
          } catch (customError) {
            console.error('[DeepSearch] 自定义搜索失败:', customError)
            return []
          }
        })()
      )
    }

    const searchResults = await Promise.all(searchPromises)
    for (const results of searchResults) {
      allItems.push(...results)
    }
    console.log(`[DeepSearch] 总共找到 ${allItems.length} 个结果`)

    // De-duplicate by URL, keeping the first occurrence.
    const uniqueUrls = new Set<string>()
    const uniqueItems = allItems.filter((item) => {
      if (uniqueUrls.has(item.url)) {
        return false
      }
      uniqueUrls.add(item.url)
      return true
    })
    console.log(`[DeepSearch] 去重后有 ${uniqueItems.length} 个结果`)

    // Keep only real http(s) URLs; the number of results is not capped.
    const validItems = uniqueItems.filter((item) => /^https?:\/\//.test(item.url))
    console.log(`[DeepSearch] 过滤后有 ${validItems.length} 个有效结果`)

    // Fetch the content behind every link.
    const results = await this.fetchContentsWithDepth(validItems, websearch)

    // Optional analysis pass (adds summary / keywords / relevanceScore and reorders).
    const analyzedResults: AnalyzedResult[] = this.analyzeConfig.enabled
      ? await this.analyzeResults(results, cleanedQuery)
      : results

    // Decorate title and content. The engine label comes from the result itself,
    // so it stays correct no matter how the analysis pass reordered the list.
    const resultsWithSource = analyzedResults.map((result) => {
      let enhancedContent = result.content
      const { summary, keywords } = result
      // Prepend the summary unless it is just the start of the content.
      if (summary && summary !== enhancedContent.substring(0, summary.length)) {
        enhancedContent = `**摘要**: ${summary}\n\n---\n\n${enhancedContent}`
      }
      // Prepend the matched keywords, if any.
      if (keywords && keywords.length > 0) {
        enhancedContent = `**关键词**: ${keywords.join(', ')}\n\n${enhancedContent}`
      }
      return {
        ...result,
        title: result.source ? `[${result.source}] ${result.title}` : result.title,
        content: enhancedContent
      }
    })

    // Most relevant first; results without a score count as 0.
    const sortedResults = [...resultsWithSource].sort((a, b) => (b.relevanceScore || 0) - (a.relevanceScore || 0))

    return {
      query: query,
      results: sortedResults.filter((result) => result.content !== noContent)
    }
  } catch (error) {
    console.error('[DeepSearch] 搜索失败:', error)
    throw new Error(`DeepSearch failed: ${error instanceof Error ? error.message : 'Unknown error'}`)
  }
}
/**
 * Annotates fetched results with a short summary, the query words found in
 * the content, and a keyword-frequency relevance score, then orders them by
 * descending score. Results without content are left unannotated.
 *
 * @param results Fetched results to analyse; the input array is not mutated.
 * @param query Search terms used for keyword matching (split on whitespace).
 * @returns A new array of annotated results, most relevant first.
 */
private async analyzeResults(results: WebSearchResult[], query: string): Promise<AnalyzedResult[]> {
  const total = results.length
  console.log(`[DeepSearch] 开始分析 ${total} 个结果`)

  const { batchSize, maxSummaryLength } = this.analyzeConfig
  const annotated: AnalyzedResult[] = results.slice()

  results.forEach((entry, position) => {
    if (entry.content === noContent) {
      return
    }
    try {
      // Summary: whitespace-collapsed content, clipped to the configured length.
      let summary = entry.content.replace(/\n+/g, ' ').replace(/\s+/g, ' ').trim()
      if (summary.length > maxSummaryLength) {
        const clipped = summary.substring(0, maxSummaryLength)
        const sentenceEnd = clipped.lastIndexOf('.')
        // End on a full sentence only if that keeps at least 70% of the allowed length.
        const body = sentenceEnd > maxSummaryLength * 0.7 ? clipped.substring(0, sentenceEnd + 1) : clipped
        summary = body + '...'
      }

      // Keywords: query words longer than two characters that occur in the content.
      const haystack = entry.content.toLowerCase()
      const keywords = query
        .split(/\s+/)
        .filter((term) => term.length > 2 && haystack.includes(term.toLowerCase()))

      // Relevance: total (overlapping) keyword occurrences per 100 characters, capped at 1.
      let relevanceScore = 0
      if (keywords.length > 0) {
        const hits = keywords.reduce((sum, term) => {
          const needle = term.toLowerCase()
          let occurrences = 0
          for (let at = haystack.indexOf(needle); at !== -1; at = haystack.indexOf(needle, at + 1)) {
            occurrences++
          }
          return sum + occurrences
        }, 0)
        relevanceScore = Math.min(1, hits / (haystack.length / 100))
      }

      annotated[position] = { ...annotated[position], summary, keywords, relevanceScore }

      // Progress log once per batch and for the final entry.
      if (position % batchSize === 0 || position === total - 1) {
        console.log(`[DeepSearch] 已分析 ${position + 1}/${total} 个结果`)
      }
    } catch (error) {
      console.error(`[DeepSearch] 分析结果 ${position} 失败:`, error)
    }
  })

  // Highest score first; unscored results count as 0.
  annotated.sort((first, second) => (second.relevanceScore || 0) - (first.relevanceScore || 0))

  console.log(`[DeepSearch] 完成分析 ${total} 个结果`)
  return annotated
}
/**
 * Extracts outbound result links from the HTML of a search-engine result page.
 *
 * The page is matched against engine-specific selector sets in a fixed order
 * (Baidu, Bing, Sogou, SearX, DuckDuckGo). The first engine that matches
 * decides which links are collected and which URL fragments are rejected.
 * Sogou, SearX and DuckDuckGo fall back to scanning every anchor when fewer
 * than 10 links were found; DuckDuckGo additionally falls back to pulling raw
 * URLs out of the HTML text. Pages that match no engine are scanned generically.
 *
 * @param htmlContent Full HTML of the result page.
 * @returns Unique `{ title, url }` pairs in discovery order; an empty array if
 *   parsing fails (the error is logged, not thrown).
 */
protected parseValidUrls(htmlContent: string): Array<{ title: string; url: string }> {
  const results: Array<{ title: string; url: string }> = []
  try {
    const doc = new DOMParser().parseFromString(htmlContent, 'text/html')
    // URLs already emitted for this page.
    const seen = new Set<string>()
    const httpUrl = /^https?:\/\//

    // URL fragments shared by several block lists.
    const enginePages = ['google.com/search', 'bing.com/search', 'baidu.com/s?']
    const pseudoSchemes = ['javascript:', 'mailto:', 'tel:']

    // Every element matching any of the selectors, in selector order.
    const queryAll = (selectors: string[]): Element[] => {
      const found: Element[] = []
      for (const selector of selectors) {
        found.push(...Array.from(doc.querySelectorAll(selector)))
      }
      return found
    }
    const allAnchors = (): Element[] => Array.from(doc.querySelectorAll('a'))

    // Appends each acceptable, not-yet-seen link to `results`.
    const collect = (links: Element[], blocked: string[], requireTitle = false): void => {
      for (const link of links) {
        // Non-anchor matches have no href, and SVG anchors expose an object
        // rather than a string; both are skipped explicitly.
        const url: unknown = (link as HTMLAnchorElement).href
        if (typeof url !== 'string' || !httpUrl.test(url)) continue
        if (seen.has(url) || blocked.some((fragment) => url.includes(fragment))) continue
        const title = (link.textContent || url).trim()
        if (requireTitle && title.length === 0) continue
        seen.add(url)
        results.push({ title: title || url, url })
      }
    }

    // Baidu result links
    const baiduResults = queryAll([
      '#content_left .result h3 a',
      '#content_left .c-container h3 a',
      '#content_left .c-container a.c-title',
      '#content_left a[data-click]'
    ])
    // Bing result links
    const bingResults = queryAll([
      '.b_algo h2 a',
      '.b_algo a.tilk',
      '.b_algo a.b_title',
      '.b_results a.b_restorLink'
    ])
    // DuckDuckGo: its DOM changes often, so many alternative selectors are tried
    const duckduckgoResults = queryAll([
      '.result__a',
      '.result__url',
      '.result__snippet a',
      '.results_links_deep a',
      'a.result__check',
      'a.js-result-title-link',
      'article a',
      '.nrn-react-div a',
      'a[href*="http"]',
      'a[data-testid]',
      '.module a'
    ])
    // Sogou result links
    const sogouResults = queryAll([
      '.vrwrap h3 a',
      '.vr-title a',
      '.citeurl a',
      '.fz-mid a',
      '.vrTitle a',
      '.fb a',
      '.results a',
      '.rb a',
      '.vr_list a',
      '.vrResult a',
      '.vr_tit_a',
      '.vr_title a'
    ])
    // SearX result links
    const searxResults = queryAll([
      '.result h4 a',
      '.result-content a',
      '.result-url',
      '.result-header a',
      '.result-link',
      '.result a',
      '.results a',
      'article a',
      '.url_wrapper a',
      '.external-link'
    ])

    if (baiduResults.length > 0) {
      console.log('[DeepSearch] 检测到Baidu搜索结果页面')
      collect(baiduResults, enginePages)
    } else if (bingResults.length > 0) {
      console.log('[DeepSearch] 检测到Bing搜索结果页面')
      collect(bingResults, enginePages)
    } else if (sogouResults.length > 0 || htmlContent.includes('sogou.com')) {
      console.log('[DeepSearch] 检测到搜狗搜索结果页面')
      collect(sogouResults, [...enginePages, 'sogou.com/web', 'duckduckgo.com/?q='])
      // Too few hits from the dedicated selectors: scan every anchor with a looser filter.
      if (results.length < 10) {
        console.log('[DeepSearch] 搜狗标准选择器找到的结果很少,尝试使用更通用的方法')
        collect(allAnchors(), ['sogou.com/web', ...pseudoSchemes], true)
      }
      console.log(`[DeepSearch] 搜狗找到 ${results.length} 个结果`)
    } else if (searxResults.length > 0 || htmlContent.includes('searx.tiekoetter.com')) {
      console.log('[DeepSearch] 检测到SearX搜索结果页面')
      collect(searxResults, [
        ...enginePages,
        'sogou.com/web',
        'duckduckgo.com/?q=',
        'searx.tiekoetter.com/search'
      ])
      // Too few hits from the dedicated selectors: scan every anchor with a looser filter.
      if (results.length < 10) {
        console.log('[DeepSearch] SearX标准选择器找到的结果很少尝试使用更通用的方法')
        collect(allAnchors(), ['searx.tiekoetter.com/search', ...pseudoSchemes], true)
      }
      console.log(`[DeepSearch] SearX找到 ${results.length} 个结果`)
    } else if (duckduckgoResults.length > 0 || htmlContent.includes('duckduckgo.com')) {
      console.log('[DeepSearch] 检测到DuckDuckGo搜索结果页面')
      if (duckduckgoResults.length < 10) {
        // Dedicated selectors matched little: scan every anchor with a DuckDuckGo-specific filter.
        console.log('[DeepSearch] DuckDuckGo标准选择器找到的结果很少尝试使用更通用的方法')
        collect(
          allAnchors(),
          [
            'duckduckgo.com',
            ...enginePages,
            ...pseudoSchemes,
            'about:',
            'chrome:',
            'file:',
            'login',
            'signup',
            'account'
          ],
          true
        )
      } else {
        collect(duckduckgoResults, [...enginePages, 'duckduckgo.com/?q='])
      }
      // Last resort: pull raw URLs straight out of the HTML text.
      if (results.length < 10 && htmlContent.includes('duckduckgo.com')) {
        console.log('[DeepSearch] DuckDuckGo结果仍然很少尝试提取所有可能的URL')
        const rawBlocked = [
          'duckduckgo.com',
          ...enginePages,
          'sogou.com/web',
          'searx.tiekoetter.com/search'
        ]
        const urlRegex = /https?:\/\/[^\s"'<>()]+/g
        let match: RegExpExecArray | null
        while ((match = urlRegex.exec(htmlContent)) !== null) {
          const url = match[0]
          if (seen.has(url) || rawBlocked.some((fragment) => url.includes(fragment))) continue
          seen.add(url)
          // No anchor text is available here, so the URL doubles as the title.
          results.push({ title: url, url })
        }
      }
      console.log(`[DeepSearch] DuckDuckGo找到 ${results.length} 个结果`)
    } else {
      // Unrecognised page: generic scan of every anchor.
      console.log('[DeepSearch] 使用通用解析方法')
      collect(
        allAnchors(),
        [
          ...enginePages,
          'duckduckgo.com/?q=',
          'sogou.com/web',
          'searx.tiekoetter.com/search',
          ...pseudoSchemes,
          'login',
          'register',
          'signup',
          'signin'
        ],
        true
      )
    }
    console.log(`[DeepSearch] 解析到 ${results.length} 个有效链接`)
  } catch (error) {
    console.error('[DeepSearch] 解析HTML失败:', error)
  }
  return results
}
/**
 * Fetches the content of every item in parallel and, when `depth` is greater
 * than 1, also follows links found inside the fetched pages (one extra level,
 * at most 30 URLs).
 *
 * Second-level URLs that were already fetched at the first level are skipped,
 * so the same page is never returned twice.
 *
 * @param items Links to fetch; `source` (if set) is copied onto each result.
 * @param _websearch Web-search state; currently unused.
 * @param depth 1 fetches only `items`; any larger value adds the second level.
 * @returns One result per item (failed fetches carry `noContent`), followed by
 *   the second-level results that produced content.
 */
private async fetchContentsWithDepth(
  items: Array<{ title: string; url: string; source?: string }>,
  _websearch: WebSearchState,
  depth: number = 1
): Promise<WebSearchResult[]> {
  console.log(`[DeepSearch] 开始并行深度抓取,深度: ${depth}`)

  // Upper bound on the number of second-level pages fetched.
  const maxSecondLevelFetches = 30

  // Fetches one page, applies the provider's content limit and tags the result.
  const fetchPage = async (
    item: { title: string; url: string; source?: string },
    isDeep: boolean
  ): Promise<WebSearchResult> => {
    console.log(isDeep ? `[DeepSearch] 抓取第二层页面: ${item.url}` : `[DeepSearch] 抓取页面: ${item.url}`)
    try {
      const result = await fetchWebContent(item.url, 'markdown', this.provider.usingBrowser)
      // A limit of -1 (or no limit at all) means "do not truncate".
      const limit = this.provider.contentLimit
      if (limit && limit !== -1 && result.content.length > limit) {
        result.content = result.content.slice(0, limit) + '...'
      }
      if (isDeep) {
        result.title = `[深度] ${result.title}`
      }
      if (item.source) {
        result.source = item.source
      }
      return result
    } catch (error) {
      console.error(
        isDeep ? `[DeepSearch] 抓取第二层 ${item.url} 失败:` : `[DeepSearch] 抓取 ${item.url} 失败:`,
        error
      )
      // Keep a placeholder so callers still get one entry per item.
      return {
        title: isDeep ? `[深度] ${item.title}` : item.title,
        content: noContent,
        url: item.url,
        source: item.source
      }
    }
  }

  // Level 1: the items themselves.
  const firstLevelResults = await Promise.all(items.map((item) => fetchPage(item, false)))
  if (depth <= 1) {
    return firstLevelResults
  }

  // Level 2: links found inside level-1 pages, minus pages already fetched.
  const alreadyFetched = new Set(items.map((item) => item.url))
  const secondLevelUrls = new Set<string>()
  for (const result of firstLevelResults) {
    if (result.content === noContent) continue
    for (const url of this.extractUrlsFromMarkdown(result.content)) {
      if (!alreadyFetched.has(url)) {
        secondLevelUrls.add(url)
      }
    }
  }

  const secondLevelUrlsArray = Array.from(secondLevelUrls).slice(0, maxSecondLevelFetches)
  console.log(`[DeepSearch] 第二层找到 ${secondLevelUrls.size} 个URL将抓取 ${secondLevelUrlsArray.length}`)

  const secondLevelResults = await Promise.all(
    secondLevelUrlsArray.map((url) =>
      // '深度链接' marks the result as coming from a followed link.
      fetchPage({ title: url, url: url, source: '深度链接' }, true)
    )
  )

  // Failed second-level fetches are dropped; first-level placeholders are kept.
  return [...firstLevelResults, ...secondLevelResults.filter((result) => result.content !== noContent)]
}
/**
 * Extracts the distinct http(s) URLs referenced in a Markdown document.
 *
 * Two passes are made: Markdown links `[text](url)` first, then bare URLs in
 * the remaining text. Markdown links are blanked out before the second pass so
 * that their targets are not matched again with the closing `)` attached, and
 * bare URLs have trailing punctuation removed.
 *
 * @param markdown Markdown text to scan.
 * @returns Unique URLs in order of first appearance (Markdown links first).
 */
private extractUrlsFromMarkdown(markdown: string): string[] {
  const urls: Set<string> = new Set()
  const httpUrl = /^https?:\/\//

  // Drops sentence punctuation and unbalanced closing parentheses from the end of a bare URL.
  const trimTrailing = (raw: string): string => {
    let url = raw
    for (;;) {
      let next = url.replace(/[.,;:!?'"\]}>*]+$/, '')
      const opens = next.split('(').length - 1
      const closes = next.split(')').length - 1
      // Keep a final ')' only when it closes a '(' inside the URL (e.g. wiki-style paths).
      if (next.endsWith(')') && closes > opens) {
        next = next.slice(0, -1)
      }
      if (next === url) {
        return url
      }
      url = next
    }
  }

  // Pass 1: Markdown links [text](url) or [text](url "title").
  const markdownLinkRegex = /\[([^\]]+)\]\(([^)]+)\)/g
  let match: RegExpExecArray | null
  while ((match = markdownLinkRegex.exec(markdown)) !== null) {
    // The destination is the first whitespace-delimited token; an optional title may follow.
    const url = (match[2] ?? '').trim().split(/\s+/)[0]
    if (url && httpUrl.test(url)) {
      urls.add(url)
    }
  }

  // Pass 2: bare URLs, searched in the text with Markdown links blanked out.
  const remainingText = markdown.replace(markdownLinkRegex, ' ')
  const urlRegex = /(https?:\/\/[^\s]+)/g
  while ((match = urlRegex.exec(remainingText)) !== null) {
    const url = trimTrailing(match[1] ?? '')
    if (httpUrl.test(url)) {
      urls.add(url)
    }
  }

  return Array.from(urls)
}
}

View File

@ -2,6 +2,7 @@ import { WebSearchProvider } from '@renderer/types'
import BaseWebSearchProvider from './BaseWebSearchProvider'
import DefaultProvider from './DefaultProvider'
import DeepSearchProvider from './DeepSearchProvider'
import ExaProvider from './ExaProvider'
import LocalBaiduProvider from './LocalBaiduProvider'
import LocalBingProvider from './LocalBingProvider'
@ -18,6 +19,8 @@ export default class WebSearchProviderFactory {
return new SearxngProvider(provider)
case 'exa':
return new ExaProvider(provider)
case 'deep-search':
return new DeepSearchProvider(provider)
case 'local-google':
return new LocalGoogleProvider(provider)
case 'local-baidu':

View File

@ -45,6 +45,11 @@ class WebSearchService {
return provider.apiHost !== ''
}
// DeepSearch提供商不需要API密钥或主机
if (provider.id === 'deep-search') {
return true
}
return false
}

View File

@ -43,6 +43,13 @@ const initialState: WebSearchState = {
name: 'Exa',
apiKey: ''
},
{
id: 'deep-search',
name: 'DeepSearch',
description: '多引擎深度搜索',
usingBrowser: true,
contentLimit: 10000
},
{
id: 'local-google',
name: 'Google',
@ -60,7 +67,7 @@ const initialState: WebSearchState = {
}
],
searchWithTime: true,
maxResults: 5,
maxResults: 10,
excludeDomains: [],
subscribeSources: [],
overwrite: false

View File

@ -362,6 +362,7 @@ export type WebSearchResult = {
title: string
content: string
url: string
source?: string
}
export type KnowledgeReference = {