diff --git a/src/main/knowledge/preprocess/MineruPreprocessProvider.ts b/src/main/knowledge/preprocess/MineruPreprocessProvider.ts index 7a5362a11..80aec4062 100644 --- a/src/main/knowledge/preprocess/MineruPreprocessProvider.ts +++ b/src/main/knowledge/preprocess/MineruPreprocessProvider.ts @@ -21,6 +21,7 @@ type ApiResponse = { type BatchUploadResponse = { batch_id: string file_urls: string[] + headers?: Record[] } type ExtractProgress = { @@ -55,7 +56,7 @@ type QuotaResponse = { export default class MineruPreprocessProvider extends BasePreprocessProvider { constructor(provider: PreprocessProvider, userId?: string) { super(provider, userId) - // todo:免费期结束后删除 + // TODO: remove after free period ends this.provider.apiKey = this.provider.apiKey || import.meta.env.MAIN_VITE_MINERU_API_KEY } @@ -68,21 +69,21 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider { logger.info(`MinerU preprocess processing started: ${filePath}`) await this.validateFile(filePath) - // 1. 获取上传URL并上传文件 + // 1. Get upload URL and upload file const batchId = await this.uploadFile(file) logger.info(`MinerU file upload completed: batch_id=${batchId}`) - // 2. 等待处理完成并获取结果 + // 2. Wait for completion and fetch results const extractResult = await this.waitForCompletion(sourceId, batchId, file.origin_name) logger.info(`MinerU processing completed for batch: ${batchId}`) - // 3. 下载并解压文件 + // 3. Download and extract output const { path: outputPath } = await this.downloadAndExtractFile(extractResult.full_zip_url!, file) // 4. check quota const quota = await this.checkQuota() - // 5. 创建处理后的文件信息 + // 5. 
Create processed file metadata return { processedFile: this.createProcessedFileInfo(file, outputPath), quota @@ -115,23 +116,48 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider { } private async validateFile(filePath: string): Promise { + // Phase 1: check file size (without loading into memory) + logger.info(`Validating PDF file: ${filePath}`) + const stats = await fs.promises.stat(filePath) + const fileSizeBytes = stats.size + + // Ensure file size is under 200MB + if (fileSizeBytes >= 200 * 1024 * 1024) { + const fileSizeMB = Math.round(fileSizeBytes / (1024 * 1024)) + throw new Error(`PDF file size (${fileSizeMB}MB) exceeds the limit of 200MB`) + } + + // Phase 2: check page count (requires reading file with error handling) const pdfBuffer = await fs.promises.readFile(filePath) - const doc = await this.readPdf(pdfBuffer) + try { + const doc = await this.readPdf(pdfBuffer) - // 文件页数小于600页 - if (doc.numPages >= 600) { - throw new Error(`PDF page count (${doc.numPages}) exceeds the limit of 600 pages`) - } - // 文件大小小于200MB - if (pdfBuffer.length >= 200 * 1024 * 1024) { - const fileSizeMB = Math.round(pdfBuffer.length / (1024 * 1024)) - throw new Error(`PDF file size (${fileSizeMB}MB) exceeds the limit of 200MB`) + // Ensure page count is under 600 pages + if (doc.numPages >= 600) { + throw new Error(`PDF page count (${doc.numPages}) exceeds the limit of 600 pages`) + } + + logger.info(`PDF validation passed: ${doc.numPages} pages, ${Math.round(fileSizeBytes / (1024 * 1024))}MB`) + } catch (error: any) { + // If the page limit is exceeded, rethrow immediately + if (error.message.includes('exceeds the limit')) { + throw error + } + + // If PDF parsing fails, log a detailed warning but continue processing + logger.warn( + `Failed to parse PDF structure (file may be corrupted or use non-standard format). ` + + `Skipping page count validation. Will attempt to process with MinerU API. ` + + `Error details: ${error.message}. 
` + + `Suggestion: If processing fails, try repairing the PDF using tools like Adobe Acrobat or online PDF repair services.` + ) + // Do not throw; continue processing } } private createProcessedFileInfo(file: FileMetadata, outputPath: string): FileMetadata { - // 查找解压后的主要文件 + // Locate the main extracted file let finalPath = '' let finalName = file.origin_name.replace('.pdf', '.md') @@ -143,14 +169,14 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider { const originalMdPath = path.join(outputPath, mdFile) const newMdPath = path.join(outputPath, finalName) - // 重命名文件为原始文件名 + // Rename the file to match the original name try { fs.renameSync(originalMdPath, newMdPath) finalPath = newMdPath logger.info(`Renamed markdown file from ${mdFile} to ${finalName}`) } catch (renameError) { logger.warn(`Failed to rename file ${mdFile} to ${finalName}: ${renameError}`) - // 如果重命名失败,使用原文件 + // If renaming fails, fall back to the original file finalPath = originalMdPath finalName = mdFile } @@ -178,7 +204,7 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider { logger.info(`Downloading MinerU result to: ${zipPath}`) try { - // 下载ZIP文件 + // Download the ZIP file const response = await net.fetch(zipUrl, { method: 'GET' }) if (!response.ok) { throw new Error(`HTTP ${response.status}: ${response.statusText}`) @@ -187,17 +213,17 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider { fs.writeFileSync(zipPath, Buffer.from(arrayBuffer)) logger.info(`Downloaded ZIP file: ${zipPath}`) - // 确保提取目录存在 + // Ensure the extraction directory exists if (!fs.existsSync(extractPath)) { fs.mkdirSync(extractPath, { recursive: true }) } - // 解压文件 + // Extract the ZIP contents const zip = new AdmZip(zipPath) zip.extractAllTo(extractPath, true) logger.info(`Extracted files to: ${extractPath}`) - // 删除临时ZIP文件 + // Remove the temporary ZIP file fs.unlinkSync(zipPath) return { path: extractPath } @@ -209,11 +235,11 @@ export 
default class MineruPreprocessProvider extends BasePreprocessProvider { private async uploadFile(file: FileMetadata): Promise { try { - // 步骤1: 获取上传URL - const { batchId, fileUrls } = await this.getBatchUploadUrls(file) - // 步骤2: 上传文件到获取的URL + // Step 1: obtain the upload URL + const { batchId, fileUrls, uploadHeaders } = await this.getBatchUploadUrls(file) + // Step 2: upload the file to the obtained URL const filePath = fileStorage.getFilePathById(file) - await this.putFileToUrl(filePath, fileUrls[0]) + await this.putFileToUrl(filePath, fileUrls[0], file.origin_name, uploadHeaders?.[0]) logger.info(`File uploaded successfully: ${filePath}`, { batchId, fileUrls }) return batchId @@ -223,7 +249,9 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider { } } - private async getBatchUploadUrls(file: FileMetadata): Promise<{ batchId: string; fileUrls: string[] }> { + private async getBatchUploadUrls( + file: FileMetadata + ): Promise<{ batchId: string; fileUrls: string[]; uploadHeaders?: Record[] }> { const endpoint = `${this.provider.apiHost}/api/v4/file-urls/batch` const payload = { @@ -254,10 +282,11 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider { if (response.ok) { const data: ApiResponse = await response.json() if (data.code === 0 && data.data) { - const { batch_id, file_urls } = data.data + const { batch_id, file_urls, headers: uploadHeaders } = data.data return { batchId: batch_id, - fileUrls: file_urls + fileUrls: file_urls, + uploadHeaders } } else { throw new Error(`API returned error: ${data.msg || JSON.stringify(data)}`) @@ -271,18 +300,28 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider { } } - private async putFileToUrl(filePath: string, uploadUrl: string): Promise { + private async putFileToUrl( + filePath: string, + uploadUrl: string, + fileName?: string, + headers?: Record + ): Promise { try { const fileBuffer = await fs.promises.readFile(filePath) + const 
fileSize = fileBuffer.byteLength + const displayName = fileName ?? path.basename(filePath) + + logger.info(`Uploading file to MinerU OSS: ${displayName} (${fileSize} bytes)`) // https://mineru.net/apiManage/docs const response = await net.fetch(uploadUrl, { method: 'PUT', - body: fileBuffer + headers, + body: new Uint8Array(fileBuffer) }) if (!response.ok) { - // 克隆 response 以避免消费 body stream + // Clone the response to avoid consuming the body stream const responseClone = response.clone() try { @@ -353,20 +392,20 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider { try { const result = await this.getExtractResults(batchId) - // 查找对应文件的处理结果 + // Find the corresponding file result const fileResult = result.extract_result.find((item) => item.file_name === fileName) if (!fileResult) { throw new Error(`File ${fileName} not found in batch results`) } - // 检查处理状态 + // Check the processing state if (fileResult.state === 'done' && fileResult.full_zip_url) { logger.info(`Processing completed for file: ${fileName}`) return fileResult } else if (fileResult.state === 'failed') { throw new Error(`Processing failed for file: ${fileName}, error: ${fileResult.err_msg}`) } else if (fileResult.state === 'running') { - // 发送进度更新 + // Send progress updates if (fileResult.extract_progress) { const progress = Math.round( (fileResult.extract_progress.extracted_pages / fileResult.extract_progress.total_pages) * 100 @@ -374,7 +413,7 @@ export default class MineruPreprocessProvider extends BasePreprocessProvider { await this.sendPreprocessProgress(sourceId, progress) logger.info(`File ${fileName} processing progress: ${progress}%`) } else { - // 如果没有具体进度信息,发送一个通用进度 + // If no detailed progress information is available, send a generic update await this.sendPreprocessProgress(sourceId, 50) logger.info(`File ${fileName} is still processing...`) } diff --git a/src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts 
b/src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts index 377eafa70..f322fbac3 100644 --- a/src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts +++ b/src/main/knowledge/preprocess/OpenMineruPreprocessProvider.ts @@ -53,18 +53,43 @@ export default class OpenMineruPreprocessProvider extends BasePreprocessProvider } private async validateFile(filePath: string): Promise { + // Phase 1: check file size (without loading into memory) + logger.info(`Validating PDF file: ${filePath}`) + const stats = await fs.promises.stat(filePath) + const fileSizeBytes = stats.size + + // File size must be less than 200MB + if (fileSizeBytes >= 200 * 1024 * 1024) { + const fileSizeMB = Math.round(fileSizeBytes / (1024 * 1024)) + throw new Error(`PDF file size (${fileSizeMB}MB) exceeds the limit of 200MB`) + } + + // Phase 2: check page count (requires reading file with error handling) const pdfBuffer = await fs.promises.readFile(filePath) - const doc = await this.readPdf(pdfBuffer) + try { + const doc = await this.readPdf(pdfBuffer) - // File page count must be less than 600 pages - if (doc.numPages >= 600) { - throw new Error(`PDF page count (${doc.numPages}) exceeds the limit of 600 pages`) - } - // File size must be less than 200MB - if (pdfBuffer.length >= 200 * 1024 * 1024) { - const fileSizeMB = Math.round(pdfBuffer.length / (1024 * 1024)) - throw new Error(`PDF file size (${fileSizeMB}MB) exceeds the limit of 200MB`) + // File page count must be less than 600 pages + if (doc.numPages >= 600) { + throw new Error(`PDF page count (${doc.numPages}) exceeds the limit of 600 pages`) + } + + logger.info(`PDF validation passed: ${doc.numPages} pages, ${Math.round(fileSizeBytes / (1024 * 1024))}MB`) + } catch (error: any) { + // If the page limit is exceeded, rethrow immediately + if (error.message.includes('exceeds the limit')) { + throw error + } + + // If PDF parsing fails, log a detailed warning but continue processing + logger.warn( + `Failed to parse PDF structure (file may be corrupted or use non-standard format). ` + + `Skipping page count validation. Will attempt to process with MinerU API. 
` + + `Error details: ${error.message}. ` + + `Suggestion: If processing fails, try repairing the PDF using tools like Adobe Acrobat or online PDF repair services.` + ) + // Do not throw; continue processing } } @@ -139,7 +164,7 @@ export default class OpenMineruPreprocessProvider extends BasePreprocessProvider ...(this.provider.apiKey ? { Authorization: `Bearer ${this.provider.apiKey}` } : {}), ...formData.getHeaders() }, - body: formData.getBuffer() + body: new Uint8Array(formData.getBuffer()) }) if (!response.ok) {