feat(knowledge, preprocess): Add OpenMinerU preprocessor so the knowledge base can process documents with the open-source version of MinerU (#10992)

* feat(knowledge): 为文档处理增加open-mineru的支持

* 移除OpenMineruPreprocessProvider多余的apiKey处理

* Add preprocessProviders migrator for open-mineru

* Remove invalid code from OpenMineruPreprocessProvider

* Fix: property 'open-mineru' is missing in PREPROCESS_PROVIDER_CONFIG

* refactor(preprocess): improve OpenMinerU error handling and file cleanup
This commit is contained in:
Carlton 2025-10-29 09:19:18 +08:00 committed by GitHub
parent 9a01e092f6
commit 888a183328
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 245 additions and 4 deletions

View File

@ -0,0 +1,199 @@
import fs from 'node:fs'
import path from 'node:path'
import { loggerService } from '@logger'
import { fileStorage } from '@main/services/FileStorage'
import { FileMetadata, PreprocessProvider } from '@types'
import AdmZip from 'adm-zip'
import { net } from 'electron'
import FormData from 'form-data'
import BasePreprocessProvider from './BasePreprocessProvider'
// Scoped logger for this module; the context names this provider so its log
// lines are not attributed to the cloud MinerU provider.
const logger = loggerService.withContext('OpenMineruPreprocessProvider')
/**
 * Preprocess provider that sends a PDF to a self-hosted (open-source) MinerU
 * server via its `/file_parse` endpoint, downloads the resulting ZIP archive,
 * extracts it locally and exposes the extracted Markdown file.
 *
 * `provider`, `userId`, `storageDir`, `readPdf` and `sendPreprocessProgress`
 * are not defined here; they are presumably supplied by BasePreprocessProvider.
 */
export default class OpenMineruPreprocessProvider extends BasePreprocessProvider {
  /** A PDF must have fewer pages than this to be accepted. */
  private static readonly MAX_PAGES = 600
  /** A PDF must be smaller than this many bytes (200MB) to be accepted. */
  private static readonly MAX_FILE_SIZE = 200 * 1024 * 1024

  constructor(provider: PreprocessProvider, userId?: string) {
    super(provider, userId)
  }

  /**
   * Validates the file, uploads it for parsing and returns the metadata of the
   * extracted Markdown file together with the remaining quota.
   *
   * @param sourceId Identifier passed to `sendPreprocessProgress` for progress reporting.
   * @param file Metadata of the PDF to process.
   * @returns The processed file metadata and the quota reported by `checkQuota`.
   * @throws Re-throws any validation, upload or extraction error after logging it.
   */
  public async parseFile(
    sourceId: string,
    file: FileMetadata
  ): Promise<{ processedFile: FileMetadata; quota: number }> {
    try {
      const filePath = fileStorage.getFilePathById(file)
      logger.info(`Open MinerU preprocess processing started: ${filePath}`)
      await this.validateFile(filePath)

      // 1. Update progress
      await this.sendPreprocessProgress(sourceId, 50)
      logger.info(`File ${file.name} is starting processing...`)

      // 2. Upload file and extract
      const { path: outputPath } = await this.uploadFileAndExtract(file)

      // 3. Check quota
      const quota = await this.checkQuota()

      // 4. Create processed file info
      return {
        processedFile: this.createProcessedFileInfo(file, outputPath),
        quota
      }
    } catch (error) {
      logger.error(`Open MinerU preprocess processing failed for ${file.name}:`, error as Error)
      throw error
    }
  }

  /**
   * Reports the remaining quota.
   *
   * @returns Always `Infinity`: the self-hosted version always has enough quota.
   */
  public async checkQuota(): Promise<number> {
    return Infinity
  }

  /**
   * Rejects PDFs that are too large or have too many pages.
   *
   * The size is checked from file metadata first so that an oversized file is
   * rejected without being loaded into memory and parsed.
   *
   * @param filePath Absolute path of the PDF on disk.
   * @throws Error when the size or page-count limit is reached.
   */
  private async validateFile(filePath: string): Promise<void> {
    // File size must be less than 200MB
    const { size } = await fs.promises.stat(filePath)
    if (size >= OpenMineruPreprocessProvider.MAX_FILE_SIZE) {
      const fileSizeMB = Math.round(size / (1024 * 1024))
      throw new Error(`PDF file size (${fileSizeMB}MB) exceeds the limit of 200MB`)
    }

    // File page count must be less than 600 pages
    const pdfBuffer = await fs.promises.readFile(filePath)
    const doc = await this.readPdf(pdfBuffer)
    if (doc.numPages >= OpenMineruPreprocessProvider.MAX_PAGES) {
      throw new Error(`PDF page count (${doc.numPages}) exceeds the limit of 600 pages`)
    }
  }

  /**
   * Removes a trailing `.pdf` extension (case-insensitive) from a file name.
   * Only the final extension is stripped, so `a.pdf.v2.pdf` becomes `a.pdf.v2`
   * and `scan.PDF` becomes `scan`.
   */
  private stripPdfExtension(fileName: string): string {
    return fileName.replace(/\.pdf$/i, '')
  }

  /**
   * Locates the Markdown file produced by extraction, renames it after the
   * original document and builds the metadata describing it.
   *
   * @param file Metadata of the original PDF.
   * @param outputPath Directory the ZIP archive was extracted into.
   * @returns A copy of `file` pointing at the Markdown result. `path` is an
   *   empty string and `size` is 0 when no Markdown file was found.
   */
  private createProcessedFileInfo(file: FileMetadata, outputPath: string): FileMetadata {
    const baseName = this.stripPdfExtension(file.origin_name)
    // Find the main file after extraction
    let finalPath = ''
    let finalName = `${baseName}.md`
    // The results are expected in a sub-folder named after the document (without extension)
    const resultDir = path.join(outputPath, baseName)

    try {
      const files = fs.readdirSync(resultDir)
      const mdFile = files.find((f) => f.endsWith('.md'))
      if (mdFile) {
        const originalMdPath = path.join(resultDir, mdFile)
        const newMdPath = path.join(resultDir, finalName)
        // Rename file to original file name
        try {
          fs.renameSync(originalMdPath, newMdPath)
          finalPath = newMdPath
          logger.info(`Renamed markdown file from ${mdFile} to ${finalName}`)
        } catch (renameError) {
          logger.warn(`Failed to rename file ${mdFile} to ${finalName}: ${renameError}`)
          // If rename fails, use the original file
          finalPath = originalMdPath
          finalName = mdFile
        }
      } else {
        // Surface the otherwise silent case of an archive without a Markdown result
        logger.warn(`No markdown file found in output directory ${resultDir}`)
      }
    } catch (error) {
      logger.warn(`Failed to read output directory ${resultDir}:`, error as Error)
      finalPath = path.join(resultDir, `${file.id}.md`)
    }

    return {
      ...file,
      name: finalName,
      path: finalPath,
      ext: '.md',
      size: fs.existsSync(finalPath) ? fs.statSync(finalPath).size : 0
    }
  }

  /**
   * Uploads the file to `<apiHost>/file_parse`, stores the returned ZIP archive
   * in the storage directory, extracts it and deletes the archive again.
   * Failed attempts are retried after a fixed delay.
   *
   * @param file Metadata of the PDF to upload.
   * @param maxRetries Maximum number of attempts.
   * @param intervalMs Delay between attempts in milliseconds.
   * @returns The directory the archive was extracted into.
   * @throws Error when no API host is configured, or the error of the last
   *   failed attempt once all attempts are used up.
   */
  private async uploadFileAndExtract(
    file: FileMetadata,
    maxRetries: number = 5,
    intervalMs: number = 5000
  ): Promise<{ path: string }> {
    // Fail fast on a missing host instead of retrying against a malformed URL,
    // and drop trailing slashes so the endpoint never contains `//file_parse`.
    const apiHost = (this.provider.apiHost ?? '').trim().replace(/\/+$/, '')
    if (!apiHost) {
      throw new Error('Open MinerU API host is not configured')
    }
    const endpoint = `${apiHost}/file_parse`

    // Read the file once; the same multipart body is reused for every attempt
    const filePath = fileStorage.getFilePathById(file)
    const fileBuffer = await fs.promises.readFile(filePath)
    const formData = new FormData()
    formData.append('return_md', 'true')
    formData.append('response_format_zip', 'true')
    formData.append('files', fileBuffer, {
      filename: file.origin_name
    })

    let retries = 0
    while (retries < maxRetries) {
      let zipPath: string | undefined
      try {
        const response = await net.fetch(endpoint, {
          method: 'POST',
          headers: {
            token: this.userId ?? '',
            ...(this.provider.apiKey ? { Authorization: `Bearer ${this.provider.apiKey}` } : {}),
            ...formData.getHeaders()
          },
          body: formData.getBuffer()
        })

        if (!response.ok) {
          throw new Error(`HTTP ${response.status}: ${response.statusText}`)
        }

        // Compare only the media type so parameters (e.g. `; charset=...`) and
        // letter case do not cause a valid ZIP response to be rejected.
        const contentType = response.headers.get('content-type')
        const mediaType = (contentType ?? '').split(';')[0].trim().toLowerCase()
        if (mediaType !== 'application/zip') {
          throw new Error(`Downloaded ZIP file has unexpected content-type: ${contentType}`)
        }

        const dirPath = this.storageDir
        zipPath = path.join(dirPath, `${file.id}.zip`)
        const extractPath = path.join(dirPath, `${file.id}`)

        const arrayBuffer = await response.arrayBuffer()
        await fs.promises.writeFile(zipPath, Buffer.from(arrayBuffer))
        logger.info(`Downloaded ZIP file: ${zipPath}`)

        // Ensure extraction directory exists (no-op when it already does)
        fs.mkdirSync(extractPath, { recursive: true })

        // Extract files
        const zip = new AdmZip(zipPath)
        zip.extractAllTo(extractPath, true)
        logger.info(`Extracted files to: ${extractPath}`)

        return { path: extractPath }
      } catch (error) {
        logger.warn(
          `Failed to upload and extract file: ${(error as Error).message}, retry ${retries + 1}/${maxRetries}`
        )
        // Last attempt: propagate the real cause instead of sleeping again
        if (retries === maxRetries - 1) {
          throw error
        }
      } finally {
        // Delete temporary ZIP file, whether the attempt succeeded or failed
        if (zipPath && fs.existsSync(zipPath)) {
          try {
            fs.unlinkSync(zipPath)
            logger.info(`Deleted temporary ZIP file: ${zipPath}`)
          } catch (deleteError) {
            logger.warn(`Failed to delete temporary ZIP file ${zipPath}:`, deleteError as Error)
          }
        }
      }

      retries++
      await new Promise((resolve) => setTimeout(resolve, intervalMs))
    }

    // Only reachable when maxRetries is not positive
    throw new Error(`Processing timeout for file: ${file.id}`)
  }
}

View File

@ -5,6 +5,7 @@ import DefaultPreprocessProvider from './DefaultPreprocessProvider'
import Doc2xPreprocessProvider from './Doc2xPreprocessProvider'
import MineruPreprocessProvider from './MineruPreprocessProvider'
import MistralPreprocessProvider from './MistralPreprocessProvider'
import OpenMineruPreprocessProvider from './OpenMineruPreprocessProvider'
export default class PreprocessProviderFactory {
static create(provider: PreprocessProvider, userId?: string): BasePreprocessProvider {
switch (provider.id) {
@ -14,6 +15,8 @@ export default class PreprocessProviderFactory {
return new MistralPreprocessProvider(provider)
case 'mineru':
return new MineruPreprocessProvider(provider, userId)
case 'open-mineru':
return new OpenMineruPreprocessProvider(provider, userId)
default:
return new DefaultPreprocessProvider(provider)
}

View File

@ -11,6 +11,8 @@ export function getPreprocessProviderLogo(providerId: PreprocessProviderId) {
return MistralLogo
case 'mineru':
return MinerULogo
case 'open-mineru':
return MinerULogo
default:
return undefined
}
@ -36,5 +38,11 @@ export const PREPROCESS_PROVIDER_CONFIG: Record<PreprocessProviderId, Preprocess
official: 'https://mineru.net/',
apiKey: 'https://mineru.net/apiManage'
}
},
'open-mineru': {
websites: {
official: 'https://github.com/opendatalab/MinerU/',
apiKey: 'https://github.com/opendatalab/MinerU/'
}
}
}

View File

@ -57,7 +57,7 @@ export const useKnowledgeBaseForm = (base?: KnowledgeBase) => {
label: t('settings.tool.preprocess.provider'),
title: t('settings.tool.preprocess.provider'),
options: preprocessProviders
.filter((p) => p.apiKey !== '' || p.id === 'mineru')
.filter((p) => p.apiKey !== '' || ['mineru', 'open-mineru'].includes(p.id))
.map((p) => ({ value: p.id, label: p.name }))
}
return [preprocessOptions]

View File

@ -71,7 +71,6 @@ const PreprocessProviderSettings: FC<Props> = ({ provider: _provider }) => {
<SettingTitle>
<Flex align="center" gap={8}>
<ProviderLogo shape="square" src={getPreprocessProviderLogo(preprocessProvider.id)} size={16} />
<ProviderName> {preprocessProvider.name}</ProviderName>
{officialWebsite && preprocessProviderConfig?.websites && (
<Link target="_blank" href={preprocessProviderConfig.websites.official}>

View File

@ -65,7 +65,7 @@ const persistedReducer = persistReducer(
{
key: 'cherry-studio',
storage,
version: 167,
version: 168,
blacklist: ['runtime', 'messages', 'messageBlocks', 'tabs'],
migrate
},

View File

@ -20,6 +20,7 @@ import { DEFAULT_SIDEBAR_ICONS } from '@renderer/config/sidebar'
import db from '@renderer/databases'
import i18n from '@renderer/i18n'
import { DEFAULT_ASSISTANT_SETTINGS } from '@renderer/services/AssistantService'
import { defaultPreprocessProviders } from '@renderer/store/preprocess'
import {
Assistant,
BuiltinOcrProvider,
@ -201,6 +202,18 @@ function addShortcuts(state: RootState, ids: string[], afterId: string) {
}
}
/**
 * Appends a copy of the default preprocess provider with the given id to the
 * persisted state, unless the state has no provider list, already contains a
 * provider with that id, or no default provider with that id exists.
 */
function addPreprocessProviders(state: RootState, id: string) {
  const currentProviders = state.preprocess?.providers
  if (!currentProviders) {
    return
  }

  const alreadyPresent = currentProviders.some((p) => p.id === id)
  if (alreadyPresent) {
    return
  }

  const template = defaultPreprocessProviders.find((p) => p.id === id)
  if (template) {
    // Push a shallow copy so the state does not share the default object
    currentProviders.push({ ...template })
  }
}
const migrateConfig = {
'2': (state: RootState) => {
try {
@ -2729,6 +2742,15 @@ const migrateConfig = {
logger.error('migrate 167 error', error as Error)
return state
}
},
'168': (state: RootState) => {
try {
addPreprocessProviders(state, 'open-mineru')
return state
} catch (error) {
logger.error('migrate 168 error', error as Error)
return state
}
}
}

View File

@ -26,10 +26,19 @@ const initialState: PreprocessState = {
model: 'mistral-ocr-latest',
apiKey: '',
apiHost: 'https://api.mistral.ai'
},
{
id: 'open-mineru',
name: 'Open MinerU',
apiKey: '',
apiHost: ''
}
],
defaultProvider: 'mineru'
}
export const defaultPreprocessProviders = initialState.providers
const preprocessSlice = createSlice({
name: 'preprocess',
initialState,

View File

@ -107,7 +107,8 @@ export type ProcessingStatus = 'pending' | 'processing' | 'completed' | 'failed'
export const PreprocessProviderIds = {
doc2x: 'doc2x',
mistral: 'mistral',
mineru: 'mineru'
mineru: 'mineru',
'open-mineru': 'open-mineru'
} as const
export type PreprocessProviderId = keyof typeof PreprocessProviderIds