cherry-studio/src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts
kangfenmao 61dd97bd0e fix: correct OpenMineru file parameter from 'files' to 'file'
Fixes issue #11252, where a locally deployed MinerU service did not work because
the FastAPI request body was built incorrectly: the multipart field was named
'files' instead of 'file', so the server-side file parameter was always None.

2025-11-14 10:49:18 +08:00
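For reference, a minimal standalone sketch of the corrected upload is shown below. The function name, file path, and use of the global fetch API are illustrative, not part of this change; the point is that the multipart part must be named 'file' so FastAPI can bind it to the server-side file parameter, while a part named 'files' leaves that parameter as None.

```ts
import fs from 'node:fs'
import FormData from 'form-data'

// Hypothetical sketch of the corrected request: the part name 'file' must match
// the FastAPI parameter on the MinerU server; with 'files' the server-side
// `file` parameter stays None and parsing never starts.
async function sketchFileParse(apiHost: string, pdfPath: string): Promise<Response> {
  const form = new FormData()
  form.append('return_md', 'true')
  form.append('response_format_zip', 'true')
  form.append('file', fs.readFileSync(pdfPath), { filename: 'example.pdf' })

  return fetch(`${apiHost}/file_parse`, {
    method: 'POST',
    headers: form.getHeaders(),
    body: form.getBuffer()
  })
}
```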

import fs from 'node:fs'
import path from 'node:path'
import { loggerService } from '@logger'
import { fileStorage } from '@main/services/FileStorage'
import type { FileMetadata, PreprocessProvider } from '@types'
import AdmZip from 'adm-zip'
import { net } from 'electron'
import FormData from 'form-data'
import BasePreprocessProvider from './BasePreprocessProvider'

const logger = loggerService.withContext('MineruPreprocessProvider')
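
/**
 * Preprocess provider for a locally deployed (self-hosted) MinerU service:
 * uploads a PDF to its `/file_parse` endpoint, downloads the returned ZIP
 * archive and extracts the generated markdown file.
 */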
export default class OpenMineruPreprocessProvider extends BasePreprocessProvider {
  constructor(provider: PreprocessProvider, userId?: string) {
    super(provider, userId)
  }
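
  /**
   * Uploads the PDF to the MinerU service, extracts the returned archive and
   * returns metadata for the generated markdown file together with the remaining quota.
   */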
  public async parseFile(
    sourceId: string,
    file: FileMetadata
  ): Promise<{ processedFile: FileMetadata; quota: number }> {
    try {
      const filePath = fileStorage.getFilePathById(file)
      logger.info(`Open MinerU preprocess processing started: ${filePath}`)

      await this.validateFile(filePath)

      // 1. Update progress
      await this.sendPreprocessProgress(sourceId, 50)
      logger.info(`File ${file.name} is starting processing...`)

      // 2. Upload file and extract
      const { path: outputPath } = await this.uploadFileAndExtract(file)

      // 3. Check quota
      const quota = await this.checkQuota()

      // 4. Create processed file info
      return {
        processedFile: this.createProcessedFileInfo(file, outputPath),
        quota
      }
    } catch (error) {
      logger.error(`Open MinerU preprocess processing failed for ${file.origin_name}:`, error as Error)
      throw error
    }
  }

  public async checkQuota() {
    // self-hosted version always has enough quota
    return Infinity
  }
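
  /**
   * Rejects PDFs with 600 or more pages, or a size of 200 MB or more, before uploading.
   */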
  private async validateFile(filePath: string): Promise<void> {
    const pdfBuffer = await fs.promises.readFile(filePath)
    const doc = await this.readPdf(pdfBuffer)

    // File page count must be less than 600 pages
    if (doc.numPages >= 600) {
      throw new Error(`PDF page count (${doc.numPages}) exceeds the limit of 600 pages`)
    }

    // File size must be less than 200MB
    if (pdfBuffer.length >= 200 * 1024 * 1024) {
      const fileSizeMB = Math.round(pdfBuffer.length / (1024 * 1024))
      throw new Error(`PDF file size (${fileSizeMB}MB) exceeds the limit of 200MB`)
    }
  }
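
  /**
   * Locates the markdown file inside the extracted output folder (named after the
   * original PDF) and renames it to match the original file name; if the rename
   * fails, the extracted name is kept.
   */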
  private createProcessedFileInfo(file: FileMetadata, outputPath: string): FileMetadata {
    // Find the main file after extraction
    let finalPath = ''
    let finalName = file.origin_name.replace('.pdf', '.md')

    // Find the corresponding folder by file name
    outputPath = path.join(outputPath, `${file.origin_name.replace('.pdf', '')}`)

    try {
      const files = fs.readdirSync(outputPath)
      const mdFile = files.find((f) => f.endsWith('.md'))
      if (mdFile) {
        const originalMdPath = path.join(outputPath, mdFile)
        const newMdPath = path.join(outputPath, finalName)
        // Rename file to original file name
        try {
          fs.renameSync(originalMdPath, newMdPath)
          finalPath = newMdPath
          logger.info(`Renamed markdown file from ${mdFile} to ${finalName}`)
        } catch (renameError) {
          logger.warn(`Failed to rename file ${mdFile} to ${finalName}: ${renameError}`)
          // If rename fails, use the original file
          finalPath = originalMdPath
          finalName = mdFile
        }
      }
    } catch (error) {
      logger.warn(`Failed to read output directory ${outputPath}:`, error as Error)
      finalPath = path.join(outputPath, `${file.id}.md`)
    }

    return {
      ...file,
      name: finalName,
      path: finalPath,
      ext: '.md',
      size: fs.existsSync(finalPath) ? fs.statSync(finalPath).size : 0
    }
  }
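
  /**
   * POSTs the PDF as multipart form data (field name 'file') to `{apiHost}/file_parse`,
   * expects an `application/zip` response, writes it to a temporary file and extracts it.
   * Retries up to `maxRetries` times, waiting `intervalMs` between attempts.
   */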
  private async uploadFileAndExtract(
    file: FileMetadata,
    maxRetries: number = 5,
    intervalMs: number = 5000
  ): Promise<{ path: string }> {
    let retries = 0
    const endpoint = `${this.provider.apiHost}/file_parse`

    // Read the PDF into a buffer for the multipart upload
    const filePath = fileStorage.getFilePathById(file)
    const fileBuffer = await fs.promises.readFile(filePath)

    const formData = new FormData()
    formData.append('return_md', 'true')
    formData.append('response_format_zip', 'true')
    formData.append('file', fileBuffer, {
      filename: file.origin_name
    })

    while (retries < maxRetries) {
      let zipPath: string | undefined
      try {
        const response = await net.fetch(endpoint, {
          method: 'POST',
          headers: {
            token: this.userId ?? '',
            ...(this.provider.apiKey ? { Authorization: `Bearer ${this.provider.apiKey}` } : {}),
            ...formData.getHeaders()
          },
          body: formData.getBuffer()
        })

        if (!response.ok) {
          throw new Error(`HTTP ${response.status}: ${response.statusText}`)
        }

        // Check if response header is application/zip
        if (response.headers.get('content-type') !== 'application/zip') {
          throw new Error(`Downloaded ZIP file has unexpected content-type: ${response.headers.get('content-type')}`)
        }

        const dirPath = this.storageDir
        zipPath = path.join(dirPath, `${file.id}.zip`)
        const extractPath = path.join(dirPath, `${file.id}`)

        const arrayBuffer = await response.arrayBuffer()
        fs.writeFileSync(zipPath, Buffer.from(arrayBuffer))
        logger.info(`Downloaded ZIP file: ${zipPath}`)

        // Ensure extraction directory exists
        if (!fs.existsSync(extractPath)) {
          fs.mkdirSync(extractPath, { recursive: true })
        }

        // Extract files
        const zip = new AdmZip(zipPath)
        zip.extractAllTo(extractPath, true)
        logger.info(`Extracted files to: ${extractPath}`)

        return { path: extractPath }
      } catch (error) {
        logger.warn(
          `Failed to upload and extract file: ${(error as Error).message}, retry ${retries + 1}/${maxRetries}`
        )
        if (retries === maxRetries - 1) {
          throw error
        }
      } finally {
        // Delete temporary ZIP file
        if (zipPath && fs.existsSync(zipPath)) {
          try {
            fs.unlinkSync(zipPath)
            logger.info(`Deleted temporary ZIP file: ${zipPath}`)
          } catch (deleteError) {
            logger.warn(`Failed to delete temporary ZIP file ${zipPath}:`, deleteError as Error)
          }
        }
      }

      retries++
      await new Promise((resolve) => setTimeout(resolve, intervalMs))
    }

    throw new Error(`Processing timeout for file: ${file.id}`)
  }
}