cherry-studio/src/main/services/PDFService.ts
1600822305 b5fd228063 1
2025-04-30 17:59:46 +08:00

424 lines
15 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { getFileType } from '@main/utils/file'
import { FileType } from '@types'
import { app } from 'electron'
import logger from 'electron-log'
import * as fs from 'fs'
import * as path from 'path'
import { PDFDocument } from 'pdf-lib'
import { v4 as uuidv4 } from 'uuid'
/** Text item as read from a page's `getTextContent()` result; only the fields used below. */
interface PdfTextItem {
  str?: string
  transform?: number[]
  width?: number
}

/** Page object handed to the pdf-parse `pagerender` callback; only the members used below. */
interface PdfPageData {
  pageIndex: number
  getTextContent?: (options: {
    normalizeWhitespace: boolean
    disableCombineTextItems: boolean
  }) => Promise<{ items: PdfTextItem[] }>
}

/**
 * PDF utilities exposed to the renderer: page counting, page-range splitting and a
 * text-only PDF -> Word (.docx) conversion.
 *
 * Static members are always referenced through the class name instead of `this`, so the
 * public methods keep working when they are passed around unbound (for example when a
 * method reference is registered directly as an IPC handler callback).
 */
export class PDFService {
  // Directory paths are resolved through methods rather than static properties.
  private static getTempDir(): string {
    return path.join(app.getPath('temp'), 'CherryStudio')
  }

  private static getStorageDir(): string {
    return path.join(app.getPath('userData'), 'files')
  }

  // Lazily loaded modules (populated by loadModules). Typed as `any` because they are
  // pulled in with `require` at runtime rather than imported statically.
  private static docx: any
  private static pdfParse: any

  /**
   * Loads the `docx` and `pdf-parse` modules on first use and caches them on the class.
   * @throws Error when either module cannot be required
   */
  private static async loadModules(): Promise<void> {
    if (!PDFService.docx) {
      try {
        PDFService.docx = require('docx')
      } catch (error) {
        logger.error('[PDFService] Error loading docx module:', error)
        throw new Error('无法加载docx模块请确保已安装')
      }
    }
    if (!PDFService.pdfParse) {
      try {
        PDFService.pdfParse = require('pdf-parse')
      } catch (error) {
        logger.error('[PDFService] Error loading pdf-parse module:', error)
        throw new Error('无法加载pdf-parse模块请确保已安装')
      }
    }
  }

  /**
   * Creates `dir` (including missing parents) when it does not exist yet.
   * @param dir directory path to create
   * @param label short name used in the log line
   */
  private static async ensureDir(dir: string, label: string): Promise<void> {
    // With `recursive: true`, mkdir resolves to the first directory it had to create,
    // or to undefined when the directory was already present.
    const created = await fs.promises.mkdir(dir, { recursive: true })
    if (created !== undefined) {
      logger.info(`[PDFService] Created ${label} directory: ${dir}`)
    }
  }

  /**
   * Returns the number of pages in a PDF file.
   * @param _ Electron IPC event (unused)
   * @param filePath path of the PDF file
   * @returns page count of the document
   * @throws rethrows any read or parse error after logging it
   */
  static async getPDFPageCount(_: Electron.IpcMainInvokeEvent, filePath: string): Promise<number> {
    try {
      logger.info(`[PDFService] Getting page count for PDF: ${filePath}`)
      // Asynchronous read so the calling process is not blocked while the file loads.
      const pdfBytes = await fs.promises.readFile(filePath)
      const pdfDoc = await PDFDocument.load(pdfBytes)
      const pageCount = pdfDoc.getPageCount()
      logger.info(`[PDFService] PDF page count: ${pageCount}`)
      return pageCount
    } catch (error) {
      logger.error('[PDFService] Error getting PDF page count:', error)
      throw error
    }
  }

  /**
   * Extracts a selection of pages from a PDF into a new file in the storage directory.
   * @param _ Electron IPC event (unused)
   * @param file the original PDF file
   * @param pageRange 1-based page selection, e.g. `1-5,8,10-15`
   * @returns metadata describing the newly written PDF
   * @throws Error when `pageRange` selects no valid page, and rethrows any I/O or PDF error
   */
  static async splitPDF(_: Electron.IpcMainInvokeEvent, file: FileType, pageRange: string): Promise<FileType> {
    try {
      logger.info(`[PDFService] Splitting PDF: ${file.path}, page range: ${pageRange}`)
      logger.info(`[PDFService] File details:`, JSON.stringify(file))

      // NOTE(review): nothing in this method writes to the temp directory; it is still
      // created here to keep the previous side effect - confirm whether it is needed.
      await PDFService.ensureDir(PDFService.getTempDir(), 'temp')
      const storageDir = PDFService.getStorageDir()
      await PDFService.ensureDir(storageDir, 'storage')

      // Read and parse the source document.
      logger.info(`[PDFService] Reading PDF file: ${file.path}`)
      const pdfBytes = await fs.promises.readFile(file.path)
      logger.info(`[PDFService] PDF file read, size: ${pdfBytes.length} bytes`)
      const pdfDoc = await PDFDocument.load(pdfBytes)
      const totalPages = pdfDoc.getPageCount()
      logger.info(`[PDFService] PDF document loaded, page count: ${totalPages}`)

      // Resolve the requested pages before creating any output.
      const pageIndexes = PDFService.parsePageRange(pageRange, totalPages)
      logger.info(`[PDFService] Page range parsed, indexes: ${pageIndexes.join(', ')}`)
      if (pageIndexes.length === 0) {
        // Without this guard an invalid range would still produce and register a file
        // that contains none of the requested pages.
        throw new Error(`No valid pages selected by range "${pageRange}" (document has ${totalPages} pages)`)
      }

      // Copy the selected pages into a fresh document.
      const newPdfDoc = await PDFDocument.create()
      logger.info(`[PDFService] New PDF document created`)
      const copiedPages = await newPdfDoc.copyPages(pdfDoc, pageIndexes)
      logger.info(`[PDFService] Pages copied, count: ${copiedPages.length}`)
      for (const page of copiedPages) {
        newPdfDoc.addPage(page)
      }

      logger.info(`[PDFService] Saving new PDF document`)
      const newPdfBytes = await newPdfDoc.save()
      logger.info(`[PDFService] New PDF document saved, size: ${newPdfBytes.length} bytes`)

      // The stored file is named after a fresh UUID.
      const uuid = uuidv4()
      const ext = '.pdf'
      const destPath = path.join(storageDir, uuid + ext)
      logger.info(`[PDFService] Destination path: ${destPath}`)

      logger.info(`[PDFService] Writing new PDF file`)
      await fs.promises.writeFile(destPath, newPdfBytes)
      logger.info(`[PDFService] New PDF file written`)

      const stats = await fs.promises.stat(destPath)
      logger.info(`[PDFService] File stats: size=${stats.size}, created=${stats.birthtime}`)

      // Strip the original extension case-insensitively so "Report.PDF" becomes "Report".
      const originExt = path.extname(file.origin_name)
      const originBase =
        originExt.toLowerCase() === ext ? path.basename(file.origin_name, originExt) : path.basename(file.origin_name)

      const newFile: FileType = {
        id: uuid,
        origin_name: `${originBase}_pages_${pageRange}.pdf`,
        name: uuid + ext,
        path: destPath,
        created_at: stats.birthtime.toISOString(),
        size: stats.size,
        ext: ext,
        type: getFileType(ext),
        count: 1,
        pdf_page_range: pageRange
      }
      logger.info(`[PDFService] PDF split successful: ${newFile.path}`)
      logger.info(`[PDFService] New file details:`, JSON.stringify(newFile))
      return newFile
    } catch (error) {
      logger.error('[PDFService] Error splitting PDF:', error)
      throw error
    }
  }

  /**
   * Converts a page-range string into a sorted, de-duplicated list of 0-based page indexes.
   *
   * Each comma-separated token must be either a single page (`8`) or an inclusive range
   * (`10-15`), written with plain decimal digits; whitespace around numbers is ignored.
   * Tokens that are malformed (`3abc`, `0x10`, `1-3-5`), out of bounds or reversed are
   * logged and skipped rather than partially interpreted.
   *
   * @param pageRange page selection string, e.g. `1-5,8,10-15`
   * @param totalPages total number of pages in the document
   * @returns 0-based page indexes in ascending order (empty when nothing valid was selected)
   */
  private static parsePageRange(pageRange: string, totalPages: number): number[] {
    logger.info(`[PDFService] Parsing page range: ${pageRange}, total pages: ${totalPages}`)
    // Whole-token match: "<digits>" or "<digits> - <digits>".
    const tokenPattern = /^(\d+)(?:\s*-\s*(\d+))?$/
    const selected = new Set<number>()

    for (const part of pageRange.split(',')) {
      const token = part.trim()
      if (!token) {
        continue
      }
      const match = tokenPattern.exec(token)
      if (!match) {
        logger.error(`[PDFService] Invalid page range part: ${token}`)
        continue
      }
      const start = Number(match[1])
      const isRange = match[2] !== undefined
      const end = isRange ? Number(match[2]) : start
      if (start < 1 || end > totalPages || start > end) {
        if (isRange) {
          logger.warn(`[PDFService] Invalid range: ${start}-${end}, totalPages: ${totalPages}`)
        } else {
          logger.warn(`[PDFService] Page ${start} out of range, totalPages: ${totalPages}`)
        }
        continue
      }
      for (let page = start; page <= end; page++) {
        // User input is 1-based, page indexes are 0-based.
        selected.add(page - 1)
      }
    }

    const result = [...selected].sort((a, b) => a - b)
    logger.info(`[PDFService] Final page indexes: ${result.join(', ')}`)
    return result
  }

  /**
   * Joins the text items of one page into a string, inserting line breaks and spaces
   * based on item positions.
   * @param items text items of a single page, in reading order
   * @returns the reconstructed page text
   */
  private static joinTextItems(items: PdfTextItem[]): string {
    // A vertical jump larger than this starts a new line.
    const lineBreakGap = 5
    // A horizontal gap larger than this (measured from the end of the previous item) gets a space.
    const wordGap = 10

    let lastY: number | null = null
    let lastX: number | null = null
    let text = ''

    for (const item of items) {
      // Skip items without visible text or without a usable transform.
      if (!item.str || item.str.trim().length === 0 || !item.transform || item.transform.length < 6) {
        continue
      }
      const x = item.transform[4]
      const y = item.transform[5]

      if (lastY !== null && Math.abs(lastY - y) > lineBreakGap) {
        text += '\n'
        lastX = null // a new line has no previous item to measure against
      } else if (
        lastX !== null &&
        text.length > 0 &&
        text[text.length - 1] !== ' ' &&
        item.str[0] !== ' ' &&
        x - lastX > wordGap
      ) {
        text += ' '
      }

      text += item.str
      lastY = y
      lastX = x + (item.width || 0)
    }
    return text
  }

  /**
   * `pagerender` callback for pdf-parse: extracts the text of a single page.
   * Never rejects; any failure is logged and yields an empty string for that page.
   * @param pageData page object supplied by pdf-parse
   * @returns the page text, or '' when the page is invalid, empty or fails to extract
   */
  private static async renderPageText(pageData: PdfPageData | null | undefined): Promise<string> {
    if (!pageData || !pageData.getTextContent) {
      logger.warn('[PDFService] Invalid page data or getTextContent method not available')
      return ''
    }
    const pageNumber = pageData.pageIndex + 1
    logger.info(`[PDFService] Processing page ${pageNumber}`)
    try {
      const textContent = await pageData.getTextContent({
        normalizeWhitespace: true,
        disableCombineTextItems: false
      })
      logger.info(`[PDFService] Page ${pageNumber} has ${textContent.items.length} text items`)
      if (textContent.items.length === 0) {
        logger.warn(`[PDFService] No text items found on page ${pageNumber}`)
        return ''
      }
      const text = PDFService.joinTextItems(textContent.items)
      logger.info(`[PDFService] Extracted ${text.length} characters from page ${pageNumber}`)
      return text
    } catch (error) {
      logger.error(`[PDFService] Error extracting text from page ${pageNumber}:`, error)
      return ''
    }
  }

  /**
   * Converts a PDF to a Word document containing the extracted text, one Word paragraph
   * per blank-line-separated block of text.
   * @param _ Electron IPC event (unused)
   * @param params `pdfBuffer` holds the PDF bytes; `outputPath` is the destination file.
   *   When `outputPath` is omitted a save dialog asks the user for a location.
   * @returns `{ success: true, path }` on success, otherwise `{ success: false, error }`;
   *   errors are reported through the result and not thrown
   */
  static async toWord(
    _: Electron.IpcMainInvokeEvent,
    params: { pdfBuffer: ArrayBuffer; outputPath?: string }
  ): Promise<{ success: boolean; path?: string; error?: string }> {
    try {
      logger.info('[PDFService] Starting PDF to Word conversion')
      await PDFService.loadModules()
      const { pdfBuffer, outputPath } = params
      const buffer = Buffer.from(pdfBuffer)

      // Extract the text with a custom per-page renderer.
      logger.info('[PDFService] Parsing PDF text')
      const pdfData = await PDFService.pdfParse(buffer, {
        pagerender: (pageData: PdfPageData | null | undefined) => PDFService.renderPageText(pageData)
      })
      const pdfText: string = pdfData.text

      logger.info(`[PDFService] Extracted text length: ${pdfText.length}`)
      if (pdfText.length < 10) {
        // Very short output usually means the extraction failed.
        logger.warn(`[PDFService] Extracted text is too short: "${pdfText}"`)
      } else {
        logger.info(
          `[PDFService] First 100 characters of extracted text: "${pdfText.substring(0, 100).replace(/\n/g, '\\n')}..."`
        )
      }
      if (pdfData.info) {
        logger.info(`[PDFService] PDF metadata:`, JSON.stringify(pdfData.info))
      }
      logger.info(`[PDFService] PDF page count: ${pdfData.numpages}`)
      if (pdfData.pdfInfo && pdfData.pdfInfo.version) {
        logger.info(`[PDFService] PDF version: ${pdfData.pdfInfo.version}`)
      }

      // Build the Word document.
      logger.info('[PDFService] Creating Word document')
      const { Document, Packer, Paragraph, TextRun } = PDFService.docx
      const paragraphs = pdfText
        .split(/\n{2,}/) // two or more newlines separate paragraphs
        .map((p) => p.replace(/\n/g, ' ').trim()) // single newlines inside a paragraph become spaces
        .filter((p) => p.length > 0)
      const doc = new Document({
        sections: [
          {
            properties: {},
            children: paragraphs.map(
              (text) =>
                new Paragraph({
                  children: [
                    new TextRun({
                      text,
                      size: 24 // 12pt
                    })
                  ]
                })
            )
          }
        ]
      })

      logger.info('[PDFService] Generating Word document buffer')
      const docBuffer = await Packer.toBuffer(doc)

      // Resolve the destination, asking the user when none was supplied.
      let finalOutputPath = outputPath
      if (!finalOutputPath) {
        const { dialog } = require('electron')
        const result = await dialog.showSaveDialog({
          title: '保存Word文档',
          defaultPath: 'converted-document.docx',
          filters: [{ name: 'Word文档', extensions: ['docx'] }]
        })
        if (result.canceled) {
          logger.info('[PDFService] User canceled save dialog')
          return { success: false, error: '用户取消了保存' }
        }
        finalOutputPath = result.filePath
      }
      if (!finalOutputPath) {
        logger.error('[PDFService] Output path is undefined')
        return {
          success: false,
          error: '输出路径未定义'
        }
      }

      logger.info(`[PDFService] Saving Word document to: ${finalOutputPath}`)
      await fs.promises.writeFile(finalOutputPath, docBuffer)
      logger.info('[PDFService] PDF to Word conversion completed successfully')
      return {
        success: true,
        path: finalOutputPath
      }
    } catch (error) {
      logger.error('[PDFService] Error converting PDF to Word:', error)
      return {
        success: false,
        error: error instanceof Error ? error.message : String(error)
      }
    }
  }
}