feat: Add PDF file support for OpenAI vision models (#7217)

* feat: add base64 PDF support for OpenAI vision models

Signed-off-by: MurphyLo <1335758958@qq.com>

* sort imports in OpenAIResponseAPIClient.ts

* sort imports in OpenAIResponseAPIClient.ts

* remove pdf-parse

* modify pdfPageCount implementation to use officeparser built-in pdf.js

* chore: update yarn.lock to remove pdf-parse dependency

---------

Signed-off-by: MurphyLo <1335758958@qq.com>
Co-authored-by: suyao <sy20010504@gmail.com>
This commit is contained in:
Murphy 2025-06-16 11:09:51 +08:00 committed by GitHub
parent b6b1b43094
commit 00e395f252
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 41 additions and 0 deletions

View File

@ -118,6 +118,7 @@ export enum IpcChannel {
File_Copy = 'file:copy',
File_BinaryImage = 'file:binaryImage',
File_Base64File = 'file:base64File',
File_GetPdfInfo = 'file:getPdfInfo',
Fs_Read = 'fs:read',
Export_Word = 'export:word',

View File

@ -226,6 +226,7 @@ export function registerIpc(mainWindow: BrowserWindow, app: Electron.App) {
ipcMain.handle(IpcChannel.File_Base64Image, fileManager.base64Image)
ipcMain.handle(IpcChannel.File_SaveBase64Image, fileManager.saveBase64Image)
ipcMain.handle(IpcChannel.File_Base64File, fileManager.base64File)
ipcMain.handle(IpcChannel.File_GetPdfInfo, fileManager.pdfPageCount)
ipcMain.handle(IpcChannel.File_Download, fileManager.downloadFile)
ipcMain.handle(IpcChannel.File_Copy, fileManager.copyFile)
ipcMain.handle(IpcChannel.File_BinaryImage, fileManager.binaryImage)

View File

@ -15,6 +15,7 @@ import * as fs from 'fs'
import { writeFileSync } from 'fs'
import { readFile } from 'fs/promises'
import officeParser from 'officeparser'
import { getDocument } from 'officeparser/pdfjs-dist-build/pdf.js'
import * as path from 'path'
import { chdir } from 'process'
import { v4 as uuidv4 } from 'uuid'
@ -321,6 +322,16 @@ class FileStorage {
return { data: base64, mime }
}
public pdfPageCount = async (_: Electron.IpcMainInvokeEvent, id: string): Promise<number> => {
const filePath = path.join(this.storageDir, id)
const buffer = await fs.promises.readFile(filePath)
const doc = await getDocument({ data: buffer }).promise
const pages = doc.numPages
await doc.destroy()
return pages
}
public binaryImage = async (_: Electron.IpcMainInvokeEvent, id: string): Promise<{ data: Buffer; mime: string }> => {
const filePath = path.join(this.storageDir, id)
const data = await fs.promises.readFile(filePath)

View File

@ -83,6 +83,7 @@ const api = {
copy: (fileId: string, destPath: string) => ipcRenderer.invoke(IpcChannel.File_Copy, fileId, destPath),
binaryImage: (fileId: string) => ipcRenderer.invoke(IpcChannel.File_BinaryImage, fileId),
base64File: (fileId: string) => ipcRenderer.invoke(IpcChannel.File_Base64File, fileId),
pdfInfo: (fileId: string) => ipcRenderer.invoke(IpcChannel.File_GetPdfInfo, fileId),
getPathForFile: (file: File) => webUtils.getPathForFile(file)
},
fs: {

View File

@ -6,6 +6,7 @@ import {
} from '@renderer/config/models'
import { estimateTextTokens } from '@renderer/services/TokenService'
import {
FileType,
FileTypes,
MCPCallToolResponse,
MCPTool,
@ -34,6 +35,7 @@ import {
} from '@renderer/utils/mcp-tools'
import { findFileBlocks, findImageBlocks } from '@renderer/utils/messageUtils/find'
import { buildSystemPrompt } from '@renderer/utils/prompt'
import { MB } from '@shared/config/constant'
import { isEmpty } from 'lodash'
import OpenAI from 'openai'
@ -90,6 +92,23 @@ export class OpenAIResponseAPIClient extends OpenAIBaseClient<
return await sdk.responses.create(payload, options)
}
private async handlePdfFile(file: FileType): Promise<OpenAI.Responses.ResponseInputFile | undefined> {
if (file.size > 32 * MB) return undefined
try {
const pageCount = await window.api.file.pdfInfo(file.id + file.ext)
if (pageCount > 100) return undefined
} catch {
return undefined
}
const { data } = await window.api.file.base64File(file.id + file.ext)
return {
type: 'input_file',
filename: file.origin_name,
file_data: `data:application/pdf;base64,${data}`
} as OpenAI.Responses.ResponseInputFile
}
public async convertMessageToSdkParam(message: Message, model: Model): Promise<OpenAIResponseSdkMessageParam> {
const isVision = isVisionModel(model)
const content = await this.getMessageContent(message)
@ -141,6 +160,14 @@ export class OpenAIResponseAPIClient extends OpenAIBaseClient<
const file = fileBlock.file
if (!file) continue
if (isVision && file.ext === '.pdf') {
const pdfPart = await this.handlePdfFile(file)
if (pdfPart) {
parts.push(pdfPart)
continue
}
}
if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) {
const fileContent = (await window.api.file.read(file.id + file.ext)).trim()
parts.push({