From 888a1833283c43df56cd1d0ab0c9929b901d0c7e Mon Sep 17 00:00:00 2001
From: Carlton
Date: Wed, 29 Oct 2025 09:19:18 +0800
Subject: [PATCH] feat(knowledge, preprocess): Add OpenMinerU preprocessor, knowledge base supports using open-source version of MinerU to process documents (#10992)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(knowledge): 为文档处理增加open-mineru的支持
* 移除OpenMineruPreprocessProvider多余的apiKey处理
* Add preprocessProviders migrator for open-mineru
* Remove invalid code from OpenMineruPreprocessProvider
* fix property '"open-mineru"' is missing in PREPROCESS_PROVIDER_CONFIG
* refactor(preprocess): improve OpenMinerU error handling and file cleanup
---
 .../OpenMineruPreprocessProvider.ts | 261 ++++++++++++++++++
 .../preprocess/PreprocessProviderFactory.ts | 3 +
 .../src/config/preprocessProviders.ts | 8 +
 .../src/hooks/useKnowledgeBaseForm.ts | 2 +-
 .../PreprocessProviderSettings.tsx | 1 -
 src/renderer/src/store/index.ts | 2 +-
 src/renderer/src/store/migrate.ts | 22 ++
 src/renderer/src/store/preprocess.ts | 9 +
 src/renderer/src/types/knowledge.ts | 3 +-
 9 files changed, 307 insertions(+), 4 deletions(-)
 create mode 100644 src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts

diff --git a/src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts b/src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts
new file mode 100644
index 0000000000..4225b87003
--- /dev/null
+++ b/src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts
@@ -0,0 +1,261 @@
+import fs from 'node:fs'
+import path from 'node:path'
+
+import { loggerService } from '@logger'
+import { fileStorage } from '@main/services/FileStorage'
+import { FileMetadata, PreprocessProvider } from '@types'
+import AdmZip from 'adm-zip'
+import { net } from 'electron'
+import FormData from 'form-data'
+
+import BasePreprocessProvider from './BasePreprocessProvider'
+
+const logger = loggerService.withContext('OpenMineruPreprocessProvider')
+
+// Limits enforced by validateFile (both bounds are exclusive)
+const MAX_PDF_PAGES = 600
+const MAX_PDF_SIZE_MB = 200
+
+/**
+ * Preprocess provider for a self-hosted (open-source) MinerU server.
+ *
+ * A PDF is posted to `<apiHost>/file_parse`, the ZIP archive sent back is
+ * extracted under `this.storageDir`, and the Markdown file found in it is
+ * returned as the processed file.
+ */
+export default class OpenMineruPreprocessProvider extends BasePreprocessProvider {
+  constructor(provider: PreprocessProvider, userId?: string) {
+    super(provider, userId)
+  }
+
+  /**
+   * Validates the PDF, uploads it for conversion and describes the result.
+   *
+   * @param sourceId - Identifier passed to `sendPreprocessProgress`.
+   * @param file - Metadata of the stored file to process.
+   * @returns Metadata of the generated Markdown file plus the quota from `checkQuota`.
+   * @throws Rethrows any validation, upload or extraction error after logging it.
+   */
+  public async parseFile(
+    sourceId: string,
+    file: FileMetadata
+  ): Promise<{ processedFile: FileMetadata; quota: number }> {
+    try {
+      const filePath = fileStorage.getFilePathById(file)
+      logger.info(`Open MinerU preprocess processing started: ${filePath}`)
+      await this.validateFile(filePath)
+
+      // 1. Update progress
+      await this.sendPreprocessProgress(sourceId, 50)
+      logger.info(`File ${file.name} is starting processing...`)
+
+      // 2. Upload file and extract
+      const { path: outputPath } = await this.uploadFileAndExtract(file)
+
+      // 3. Check quota
+      const quota = await this.checkQuota()
+
+      // 4. Create processed file info
+      return {
+        processedFile: this.createProcessedFileInfo(file, outputPath),
+        quota
+      }
+    } catch (error) {
+      // Name the file so the failure can be traced back to a document
+      logger.error(`Open MinerU preprocess processing failed for ${file.origin_name}:`, error as Error)
+      throw error
+    }
+  }
+
+  /**
+   * Reports the remaining quota.
+   *
+   * @returns Always `Infinity`: the self-hosted version has no quota.
+   */
+  public async checkQuota(): Promise<number> {
+    // self-hosted version always has enough quota
+    return Infinity
+  }
+
+  /**
+   * Rejects PDFs that reach the size or page-count limit.
+   *
+   * @param filePath - Path of the PDF on disk.
+   * @throws Error when the file is 200MB or larger, or has 600 pages or more.
+   */
+  private async validateFile(filePath: string): Promise<void> {
+    // Check the size from the file metadata first, so an oversized file is
+    // rejected before it is loaded into memory
+    const { size } = await fs.promises.stat(filePath)
+    if (size >= MAX_PDF_SIZE_MB * 1024 * 1024) {
+      const fileSizeMB = Math.round(size / (1024 * 1024))
+      throw new Error(`PDF file size (${fileSizeMB}MB) exceeds the limit of ${MAX_PDF_SIZE_MB}MB`)
+    }
+
+    const pdfBuffer = await fs.promises.readFile(filePath)
+    const doc = await this.readPdf(pdfBuffer)
+    if (doc.numPages >= MAX_PDF_PAGES) {
+      throw new Error(`PDF page count (${doc.numPages}) exceeds the limit of ${MAX_PDF_PAGES} pages`)
+    }
+  }
+
+  /**
+   * Locates the Markdown file in the extracted output and builds its metadata.
+   *
+   * The first `.md` file found is renamed after the original document. When
+   * the directory cannot be read or holds no Markdown file, the returned path
+   * points at `<file.id>.md` inside it and `size` is 0 if that file is absent.
+   *
+   * @param file - Metadata of the original file.
+   * @param outputPath - Directory the ZIP archive was extracted to.
+   * @returns A copy of `file` describing the Markdown result.
+   */
+  private createProcessedFileInfo(file: FileMetadata, outputPath: string): FileMetadata {
+    // Strip only the trailing extension, whatever its case; a plain
+    // replace('.pdf', ...) misses "X.PDF" and rewrites "a.pdf.b.pdf" wrongly
+    const baseName = path.parse(file.origin_name).name
+    // Find the corresponding folder by file name
+    // NOTE(review): assumes the archive holds a folder named after the uploaded file - confirm
+    const outputDir = path.join(outputPath, baseName)
+
+    let finalName = `${baseName}.md`
+    let finalPath = path.join(outputDir, `${file.id}.md`)
+    try {
+      const mdFile = fs.readdirSync(outputDir).find((f) => f.endsWith('.md'))
+      if (mdFile) {
+        const originalMdPath = path.join(outputDir, mdFile)
+        const newMdPath = path.join(outputDir, finalName)
+
+        // Rename file to original file name
+        try {
+          fs.renameSync(originalMdPath, newMdPath)
+          finalPath = newMdPath
+          logger.info(`Renamed markdown file from ${mdFile} to ${finalName}`)
+        } catch (renameError) {
+          logger.warn(`Failed to rename file ${mdFile} to ${finalName}: ${renameError}`)
+          // If rename fails, use the original file
+          finalPath = originalMdPath
+          finalName = mdFile
+        }
+      } else {
+        // Keep the fallback path, but leave a trace instead of failing silently
+        logger.warn(`No markdown file found in output directory ${outputDir}`)
+      }
+    } catch (error) {
+      logger.warn(`Failed to read output directory ${outputDir}:`, error as Error)
+    }
+
+    return {
+      ...file,
+      name: finalName,
+      path: finalPath,
+      ext: '.md',
+      size: fs.existsSync(finalPath) ? fs.statSync(finalPath).size : 0
+    }
+  }
+
+  /**
+   * Posts the file to the MinerU `file_parse` endpoint and extracts the ZIP reply.
+   *
+   * @param file - Metadata of the stored file to upload.
+   * @param maxRetries - Maximum number of attempts.
+   * @param intervalMs - Delay between two attempts, in milliseconds.
+   * @returns The directory the archive was extracted to.
+   * @throws Error when no API host is configured, or the last attempt's error.
+   */
+  private async uploadFileAndExtract(
+    file: FileMetadata,
+    maxRetries: number = 5,
+    intervalMs: number = 5000
+  ): Promise<{ path: string }> {
+    // Fail fast on a missing host instead of retrying an invalid URL, and drop
+    // trailing slashes so the endpoint never contains "//file_parse"
+    const apiHost = this.provider.apiHost?.trim().replace(/\/+$/, '')
+    if (!apiHost) {
+      throw new Error('Open MinerU API host is not configured')
+    }
+    const endpoint = `${apiHost}/file_parse`
+
+    // Read the file once; the same form data is reused by every attempt
+    const filePath = fileStorage.getFilePathById(file)
+    const fileBuffer = await fs.promises.readFile(filePath)
+
+    const formData = new FormData()
+    formData.append('return_md', 'true')
+    formData.append('response_format_zip', 'true')
+    formData.append('files', fileBuffer, {
+      filename: file.origin_name
+    })
+
+    for (let attempt = 1; attempt <= maxRetries; attempt++) {
+      let zipPath: string | undefined
+
+      try {
+        const response = await net.fetch(endpoint, {
+          method: 'POST',
+          headers: {
+            token: this.userId ?? '',
+            ...(this.provider.apiKey ? { Authorization: `Bearer ${this.provider.apiKey}` } : {}),
+            ...formData.getHeaders()
+          },
+          body: formData.getBuffer()
+        })
+
+        if (!response.ok) {
+          throw new Error(`HTTP ${response.status}: ${response.statusText}`)
+        }
+
+        // Compare the media type only: the header may carry parameters
+        // (e.g. "; charset=...") and is case-insensitive
+        const contentType = response.headers.get('content-type')
+        const mediaType = contentType?.split(';')[0]?.trim().toLowerCase()
+        if (mediaType !== 'application/zip') {
+          throw new Error(`Downloaded ZIP file has unexpected content-type: ${contentType}`)
+        }
+
+        const dirPath = this.storageDir
+
+        zipPath = path.join(dirPath, `${file.id}.zip`)
+        const extractPath = path.join(dirPath, `${file.id}`)
+
+        const arrayBuffer = await response.arrayBuffer()
+        fs.writeFileSync(zipPath, Buffer.from(arrayBuffer))
+        logger.info(`Downloaded ZIP file: ${zipPath}`)
+
+        // Ensure extraction directory exists
+        if (!fs.existsSync(extractPath)) {
+          fs.mkdirSync(extractPath, { recursive: true })
+        }
+
+        // Extract files
+        const zip = new AdmZip(zipPath)
+        zip.extractAllTo(extractPath, true)
+        logger.info(`Extracted files to: ${extractPath}`)
+
+        return { path: extractPath }
+      } catch (error) {
+        logger.warn(
+          `Failed to upload and extract file: ${(error as Error).message}, retry ${attempt}/${maxRetries}`
+        )
+        if (attempt === maxRetries) {
+          throw error
+        }
+      } finally {
+        // Delete temporary ZIP file
+        if (zipPath && fs.existsSync(zipPath)) {
+          try {
+            fs.unlinkSync(zipPath)
+            logger.info(`Deleted temporary ZIP file: ${zipPath}`)
+          } catch (deleteError) {
+            logger.warn(`Failed to delete temporary ZIP file ${zipPath}:`, deleteError as Error)
+          }
+        }
+      }
+
+      await new Promise((resolve) => setTimeout(resolve, intervalMs))
+    }
+
+    // Only reached when maxRetries is zero or negative
+    throw new Error(`Processing timeout for file: ${file.id}`)
+  }
+}
diff --git a/src/main/knowledge/preprocess/PreprocessProviderFactory.ts b/src/main/knowledge/preprocess/PreprocessProviderFactory.ts
index bebecd388f..e824601749 100644
--- a/src/main/knowledge/preprocess/PreprocessProviderFactory.ts
+++ 
b/src/main/knowledge/preprocess/PreprocessProviderFactory.ts @@ -5,6 +5,7 @@ import DefaultPreprocessProvider from './DefaultPreprocessProvider' import Doc2xPreprocessProvider from './Doc2xPreprocessProvider' import MineruPreprocessProvider from './MineruPreprocessProvider' import MistralPreprocessProvider from './MistralPreprocessProvider' +import OpenMineruPreprocessProvider from './OpenMineruPreprocessProvider' export default class PreprocessProviderFactory { static create(provider: PreprocessProvider, userId?: string): BasePreprocessProvider { switch (provider.id) { @@ -14,6 +15,8 @@ export default class PreprocessProviderFactory { return new MistralPreprocessProvider(provider) case 'mineru': return new MineruPreprocessProvider(provider, userId) + case 'open-mineru': + return new OpenMineruPreprocessProvider(provider, userId) default: return new DefaultPreprocessProvider(provider) } diff --git a/src/renderer/src/config/preprocessProviders.ts b/src/renderer/src/config/preprocessProviders.ts index 88215b328d..c01880eb95 100644 --- a/src/renderer/src/config/preprocessProviders.ts +++ b/src/renderer/src/config/preprocessProviders.ts @@ -11,6 +11,8 @@ export function getPreprocessProviderLogo(providerId: PreprocessProviderId) { return MistralLogo case 'mineru': return MinerULogo + case 'open-mineru': + return MinerULogo default: return undefined } @@ -36,5 +38,11 @@ export const PREPROCESS_PROVIDER_CONFIG: Record { label: t('settings.tool.preprocess.provider'), title: t('settings.tool.preprocess.provider'), options: preprocessProviders - .filter((p) => p.apiKey !== '' || p.id === 'mineru') + .filter((p) => p.apiKey !== '' || ['mineru', 'open-mineru'].includes(p.id)) .map((p) => ({ value: p.id, label: p.name })) } return [preprocessOptions] diff --git a/src/renderer/src/pages/settings/DocProcessSettings/PreprocessProviderSettings.tsx b/src/renderer/src/pages/settings/DocProcessSettings/PreprocessProviderSettings.tsx index d19eec4d6d..56fea50ec8 100644 --- 
a/src/renderer/src/pages/settings/DocProcessSettings/PreprocessProviderSettings.tsx +++ b/src/renderer/src/pages/settings/DocProcessSettings/PreprocessProviderSettings.tsx @@ -71,7 +71,6 @@ const PreprocessProviderSettings: FC = ({ provider: _provider }) => { - {preprocessProvider.name} {officialWebsite && preprocessProviderConfig?.websites && ( diff --git a/src/renderer/src/store/index.ts b/src/renderer/src/store/index.ts index 85afd68cb9..27bddaefc5 100644 --- a/src/renderer/src/store/index.ts +++ b/src/renderer/src/store/index.ts @@ -65,7 +65,7 @@ const persistedReducer = persistReducer( { key: 'cherry-studio', storage, - version: 167, + version: 168, blacklist: ['runtime', 'messages', 'messageBlocks', 'tabs'], migrate }, diff --git a/src/renderer/src/store/migrate.ts b/src/renderer/src/store/migrate.ts index 7b7a834838..19346dace4 100644 --- a/src/renderer/src/store/migrate.ts +++ b/src/renderer/src/store/migrate.ts @@ -20,6 +20,7 @@ import { DEFAULT_SIDEBAR_ICONS } from '@renderer/config/sidebar' import db from '@renderer/databases' import i18n from '@renderer/i18n' import { DEFAULT_ASSISTANT_SETTINGS } from '@renderer/services/AssistantService' +import { defaultPreprocessProviders } from '@renderer/store/preprocess' import { Assistant, BuiltinOcrProvider, @@ -201,6 +202,18 @@ function addShortcuts(state: RootState, ids: string[], afterId: string) { } } +// add preprocess provider +function addPreprocessProviders(state: RootState, id: string) { + if (state.preprocess && state.preprocess.providers) { + if (!state.preprocess.providers.find((p) => p.id === id)) { + const provider = defaultPreprocessProviders.find((p) => p.id === id) + if (provider) { + state.preprocess.providers.push({ ...provider }) + } + } + } +} + const migrateConfig = { '2': (state: RootState) => { try { @@ -2729,6 +2742,15 @@ const migrateConfig = { logger.error('migrate 167 error', error as Error) return state } + }, + '168': (state: RootState) => { + try { + addPreprocessProviders(state, 
'open-mineru') + return state + } catch (error) { + logger.error('migrate 168 error', error as Error) + return state + } } } diff --git a/src/renderer/src/store/preprocess.ts b/src/renderer/src/store/preprocess.ts index 0dfaca4d08..2beab719f6 100644 --- a/src/renderer/src/store/preprocess.ts +++ b/src/renderer/src/store/preprocess.ts @@ -26,10 +26,19 @@ const initialState: PreprocessState = { model: 'mistral-ocr-latest', apiKey: '', apiHost: 'https://api.mistral.ai' + }, + { + id: 'open-mineru', + name: 'Open MinerU', + apiKey: '', + apiHost: '' } ], defaultProvider: 'mineru' } + +export const defaultPreprocessProviders = initialState.providers + const preprocessSlice = createSlice({ name: 'preprocess', initialState, diff --git a/src/renderer/src/types/knowledge.ts b/src/renderer/src/types/knowledge.ts index c32e6dd003..d168349c85 100644 --- a/src/renderer/src/types/knowledge.ts +++ b/src/renderer/src/types/knowledge.ts @@ -107,7 +107,8 @@ export type ProcessingStatus = 'pending' | 'processing' | 'completed' | 'failed' export const PreprocessProviderIds = { doc2x: 'doc2x', mistral: 'mistral', - mineru: 'mineru' + mineru: 'mineru', + 'open-mineru': 'open-mineru' } as const export type PreprocessProviderId = keyof typeof PreprocessProviderIds