mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2025-12-29 23:12:38 +08:00
feat(knowledge, preprocess): Add OpenMinerU preprocessor, knowledge base supports using open-source version of MinerU to process documents (#10992)
* feat(knowledge): 为文档处理增加open-mineru的支持 * 移除OpenMineruPreprocessProvider多余的apiKey处理 * Add preprocessProviders migrator for open-mineru * Remove invalid code from OpenMineruPreprocessProvider * fix property '"open-mineru"' is missing in PREPROCESS_PROVIDER_CONFIG * refactor(preprocess): improve OpenMinerU error handling and file cleanup
This commit is contained in:
parent
9a01e092f6
commit
888a183328
199
src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts
Normal file
199
src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts
Normal file
@ -0,0 +1,199 @@
|
||||
import fs from 'node:fs'
|
||||
import path from 'node:path'
|
||||
|
||||
import { loggerService } from '@logger'
|
||||
import { fileStorage } from '@main/services/FileStorage'
|
||||
import { FileMetadata, PreprocessProvider } from '@types'
|
||||
import AdmZip from 'adm-zip'
|
||||
import { net } from 'electron'
|
||||
import FormData from 'form-data'
|
||||
|
||||
import BasePreprocessProvider from './BasePreprocessProvider'
|
||||
|
||||
const logger = loggerService.withContext('MineruPreprocessProvider')
|
||||
|
||||
export default class OpenMineruPreprocessProvider extends BasePreprocessProvider {
|
||||
constructor(provider: PreprocessProvider, userId?: string) {
|
||||
super(provider, userId)
|
||||
}
|
||||
|
||||
public async parseFile(
|
||||
sourceId: string,
|
||||
file: FileMetadata
|
||||
): Promise<{ processedFile: FileMetadata; quota: number }> {
|
||||
try {
|
||||
const filePath = fileStorage.getFilePathById(file)
|
||||
logger.info(`Open MinerU preprocess processing started: ${filePath}`)
|
||||
await this.validateFile(filePath)
|
||||
|
||||
// 1. Update progress
|
||||
await this.sendPreprocessProgress(sourceId, 50)
|
||||
logger.info(`File ${file.name} is starting processing...`)
|
||||
|
||||
// 2. Upload file and extract
|
||||
const { path: outputPath } = await this.uploadFileAndExtract(file)
|
||||
|
||||
// 3. Check quota
|
||||
const quota = await this.checkQuota()
|
||||
|
||||
// 4. Create processed file info
|
||||
return {
|
||||
processedFile: this.createProcessedFileInfo(file, outputPath),
|
||||
quota
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Open MinerU preprocess processing failed for:`, error as Error)
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
public async checkQuota() {
|
||||
// self-hosted version always has enough quota
|
||||
return Infinity
|
||||
}
|
||||
|
||||
private async validateFile(filePath: string): Promise<void> {
|
||||
const pdfBuffer = await fs.promises.readFile(filePath)
|
||||
|
||||
const doc = await this.readPdf(pdfBuffer)
|
||||
|
||||
// File page count must be less than 600 pages
|
||||
if (doc.numPages >= 600) {
|
||||
throw new Error(`PDF page count (${doc.numPages}) exceeds the limit of 600 pages`)
|
||||
}
|
||||
// File size must be less than 200MB
|
||||
if (pdfBuffer.length >= 200 * 1024 * 1024) {
|
||||
const fileSizeMB = Math.round(pdfBuffer.length / (1024 * 1024))
|
||||
throw new Error(`PDF file size (${fileSizeMB}MB) exceeds the limit of 200MB`)
|
||||
}
|
||||
}
|
||||
|
||||
private createProcessedFileInfo(file: FileMetadata, outputPath: string): FileMetadata {
|
||||
// Find the main file after extraction
|
||||
let finalPath = ''
|
||||
let finalName = file.origin_name.replace('.pdf', '.md')
|
||||
// Find the corresponding folder by file name
|
||||
outputPath = path.join(outputPath, `${file.origin_name.replace('.pdf', '')}`)
|
||||
try {
|
||||
const files = fs.readdirSync(outputPath)
|
||||
|
||||
const mdFile = files.find((f) => f.endsWith('.md'))
|
||||
if (mdFile) {
|
||||
const originalMdPath = path.join(outputPath, mdFile)
|
||||
const newMdPath = path.join(outputPath, finalName)
|
||||
|
||||
// Rename file to original file name
|
||||
try {
|
||||
fs.renameSync(originalMdPath, newMdPath)
|
||||
finalPath = newMdPath
|
||||
logger.info(`Renamed markdown file from ${mdFile} to ${finalName}`)
|
||||
} catch (renameError) {
|
||||
logger.warn(`Failed to rename file ${mdFile} to ${finalName}: ${renameError}`)
|
||||
// If rename fails, use the original file
|
||||
finalPath = originalMdPath
|
||||
finalName = mdFile
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
logger.warn(`Failed to read output directory ${outputPath}:`, error as Error)
|
||||
finalPath = path.join(outputPath, `${file.id}.md`)
|
||||
}
|
||||
|
||||
return {
|
||||
...file,
|
||||
name: finalName,
|
||||
path: finalPath,
|
||||
ext: '.md',
|
||||
size: fs.existsSync(finalPath) ? fs.statSync(finalPath).size : 0
|
||||
}
|
||||
}
|
||||
|
||||
private async uploadFileAndExtract(
|
||||
file: FileMetadata,
|
||||
maxRetries: number = 5,
|
||||
intervalMs: number = 5000
|
||||
): Promise<{ path: string }> {
|
||||
let retries = 0
|
||||
|
||||
const endpoint = `${this.provider.apiHost}/file_parse`
|
||||
|
||||
// Get file stream
|
||||
const filePath = fileStorage.getFilePathById(file)
|
||||
const fileBuffer = await fs.promises.readFile(filePath)
|
||||
|
||||
const formData = new FormData()
|
||||
formData.append('return_md', 'true')
|
||||
formData.append('response_format_zip', 'true')
|
||||
formData.append('files', fileBuffer, {
|
||||
filename: file.origin_name
|
||||
})
|
||||
|
||||
while (retries < maxRetries) {
|
||||
let zipPath: string | undefined
|
||||
|
||||
try {
|
||||
const response = await net.fetch(endpoint, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
token: this.userId ?? '',
|
||||
...(this.provider.apiKey ? { Authorization: `Bearer ${this.provider.apiKey}` } : {}),
|
||||
...formData.getHeaders()
|
||||
},
|
||||
body: formData.getBuffer()
|
||||
})
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`)
|
||||
}
|
||||
|
||||
// Check if response header is application/zip
|
||||
if (response.headers.get('content-type') !== 'application/zip') {
|
||||
throw new Error(`Downloaded ZIP file has unexpected content-type: ${response.headers.get('content-type')}`)
|
||||
}
|
||||
|
||||
const dirPath = this.storageDir
|
||||
|
||||
zipPath = path.join(dirPath, `${file.id}.zip`)
|
||||
const extractPath = path.join(dirPath, `${file.id}`)
|
||||
|
||||
const arrayBuffer = await response.arrayBuffer()
|
||||
fs.writeFileSync(zipPath, Buffer.from(arrayBuffer))
|
||||
logger.info(`Downloaded ZIP file: ${zipPath}`)
|
||||
|
||||
// Ensure extraction directory exists
|
||||
if (!fs.existsSync(extractPath)) {
|
||||
fs.mkdirSync(extractPath, { recursive: true })
|
||||
}
|
||||
|
||||
// Extract files
|
||||
const zip = new AdmZip(zipPath)
|
||||
zip.extractAllTo(extractPath, true)
|
||||
logger.info(`Extracted files to: ${extractPath}`)
|
||||
|
||||
return { path: extractPath }
|
||||
} catch (error) {
|
||||
logger.warn(
|
||||
`Failed to upload and extract file: ${(error as Error).message}, retry ${retries + 1}/${maxRetries}`
|
||||
)
|
||||
if (retries === maxRetries - 1) {
|
||||
throw error
|
||||
}
|
||||
} finally {
|
||||
// Delete temporary ZIP file
|
||||
if (zipPath && fs.existsSync(zipPath)) {
|
||||
try {
|
||||
fs.unlinkSync(zipPath)
|
||||
logger.info(`Deleted temporary ZIP file: ${zipPath}`)
|
||||
} catch (deleteError) {
|
||||
logger.warn(`Failed to delete temporary ZIP file ${zipPath}:`, deleteError as Error)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
retries++
|
||||
await new Promise((resolve) => setTimeout(resolve, intervalMs))
|
||||
}
|
||||
|
||||
throw new Error(`Processing timeout for file: ${file.id}`)
|
||||
}
|
||||
}
|
||||
@ -5,6 +5,7 @@ import DefaultPreprocessProvider from './DefaultPreprocessProvider'
|
||||
import Doc2xPreprocessProvider from './Doc2xPreprocessProvider'
|
||||
import MineruPreprocessProvider from './MineruPreprocessProvider'
|
||||
import MistralPreprocessProvider from './MistralPreprocessProvider'
|
||||
import OpenMineruPreprocessProvider from './OpenMineruPreprocessProvider'
|
||||
export default class PreprocessProviderFactory {
|
||||
static create(provider: PreprocessProvider, userId?: string): BasePreprocessProvider {
|
||||
switch (provider.id) {
|
||||
@ -14,6 +15,8 @@ export default class PreprocessProviderFactory {
|
||||
return new MistralPreprocessProvider(provider)
|
||||
case 'mineru':
|
||||
return new MineruPreprocessProvider(provider, userId)
|
||||
case 'open-mineru':
|
||||
return new OpenMineruPreprocessProvider(provider, userId)
|
||||
default:
|
||||
return new DefaultPreprocessProvider(provider)
|
||||
}
|
||||
|
||||
@ -11,6 +11,8 @@ export function getPreprocessProviderLogo(providerId: PreprocessProviderId) {
|
||||
return MistralLogo
|
||||
case 'mineru':
|
||||
return MinerULogo
|
||||
case 'open-mineru':
|
||||
return MinerULogo
|
||||
default:
|
||||
return undefined
|
||||
}
|
||||
@ -36,5 +38,11 @@ export const PREPROCESS_PROVIDER_CONFIG: Record<PreprocessProviderId, Preprocess
|
||||
official: 'https://mineru.net/',
|
||||
apiKey: 'https://mineru.net/apiManage'
|
||||
}
|
||||
},
|
||||
'open-mineru': {
|
||||
websites: {
|
||||
official: 'https://github.com/opendatalab/MinerU/',
|
||||
apiKey: 'https://github.com/opendatalab/MinerU/'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -57,7 +57,7 @@ export const useKnowledgeBaseForm = (base?: KnowledgeBase) => {
|
||||
label: t('settings.tool.preprocess.provider'),
|
||||
title: t('settings.tool.preprocess.provider'),
|
||||
options: preprocessProviders
|
||||
.filter((p) => p.apiKey !== '' || p.id === 'mineru')
|
||||
.filter((p) => p.apiKey !== '' || ['mineru', 'open-mineru'].includes(p.id))
|
||||
.map((p) => ({ value: p.id, label: p.name }))
|
||||
}
|
||||
return [preprocessOptions]
|
||||
|
||||
@ -71,7 +71,6 @@ const PreprocessProviderSettings: FC<Props> = ({ provider: _provider }) => {
|
||||
<SettingTitle>
|
||||
<Flex align="center" gap={8}>
|
||||
<ProviderLogo shape="square" src={getPreprocessProviderLogo(preprocessProvider.id)} size={16} />
|
||||
|
||||
<ProviderName> {preprocessProvider.name}</ProviderName>
|
||||
{officialWebsite && preprocessProviderConfig?.websites && (
|
||||
<Link target="_blank" href={preprocessProviderConfig.websites.official}>
|
||||
|
||||
@ -65,7 +65,7 @@ const persistedReducer = persistReducer(
|
||||
{
|
||||
key: 'cherry-studio',
|
||||
storage,
|
||||
version: 167,
|
||||
version: 168,
|
||||
blacklist: ['runtime', 'messages', 'messageBlocks', 'tabs'],
|
||||
migrate
|
||||
},
|
||||
|
||||
@ -20,6 +20,7 @@ import { DEFAULT_SIDEBAR_ICONS } from '@renderer/config/sidebar'
|
||||
import db from '@renderer/databases'
|
||||
import i18n from '@renderer/i18n'
|
||||
import { DEFAULT_ASSISTANT_SETTINGS } from '@renderer/services/AssistantService'
|
||||
import { defaultPreprocessProviders } from '@renderer/store/preprocess'
|
||||
import {
|
||||
Assistant,
|
||||
BuiltinOcrProvider,
|
||||
@ -201,6 +202,18 @@ function addShortcuts(state: RootState, ids: string[], afterId: string) {
|
||||
}
|
||||
}
|
||||
|
||||
// add preprocess provider
|
||||
function addPreprocessProviders(state: RootState, id: string) {
|
||||
if (state.preprocess && state.preprocess.providers) {
|
||||
if (!state.preprocess.providers.find((p) => p.id === id)) {
|
||||
const provider = defaultPreprocessProviders.find((p) => p.id === id)
|
||||
if (provider) {
|
||||
state.preprocess.providers.push({ ...provider })
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const migrateConfig = {
|
||||
'2': (state: RootState) => {
|
||||
try {
|
||||
@ -2729,6 +2742,15 @@ const migrateConfig = {
|
||||
logger.error('migrate 167 error', error as Error)
|
||||
return state
|
||||
}
|
||||
},
|
||||
'168': (state: RootState) => {
|
||||
try {
|
||||
addPreprocessProviders(state, 'open-mineru')
|
||||
return state
|
||||
} catch (error) {
|
||||
logger.error('migrate 168 error', error as Error)
|
||||
return state
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -26,10 +26,19 @@ const initialState: PreprocessState = {
|
||||
model: 'mistral-ocr-latest',
|
||||
apiKey: '',
|
||||
apiHost: 'https://api.mistral.ai'
|
||||
},
|
||||
{
|
||||
id: 'open-mineru',
|
||||
name: 'Open MinerU',
|
||||
apiKey: '',
|
||||
apiHost: ''
|
||||
}
|
||||
],
|
||||
defaultProvider: 'mineru'
|
||||
}
|
||||
|
||||
export const defaultPreprocessProviders = initialState.providers
|
||||
|
||||
const preprocessSlice = createSlice({
|
||||
name: 'preprocess',
|
||||
initialState,
|
||||
|
||||
@ -107,7 +107,8 @@ export type ProcessingStatus = 'pending' | 'processing' | 'completed' | 'failed'
|
||||
export const PreprocessProviderIds = {
|
||||
doc2x: 'doc2x',
|
||||
mistral: 'mistral',
|
||||
mineru: 'mineru'
|
||||
mineru: 'mineru',
|
||||
'open-mineru': 'open-mineru'
|
||||
} as const
|
||||
|
||||
export type PreprocessProviderId = keyof typeof PreprocessProviderIds
|
||||
|
||||
Loading…
Reference in New Issue
Block a user