refactor: update pdfjs import and improve file filtering in electron-builder configuration (#8198)

* refactor: update pdfjs import and improve file filtering in electron-builder configuration

- Changed pdfjs import to a direct import from 'pdfjs-dist' for consistency across modules.
- Updated file filtering in electron-builder.yml to exclude all legacy files from pdfjs-dist instead of just specific ones.

* format code
This commit is contained in:
beyondkmp 2025-07-21 10:26:07 +08:00 committed by GitHub
parent 7b58883d33
commit 47a0dbf87a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 13 additions and 13 deletions

View File

@ -54,7 +54,7 @@ files:
- '!node_modules/mammoth/{mammoth.browser.js,mammoth.browser.min.js}'
- '!node_modules/selection-hook/prebuilds/**/*' # we rebuild .node, don't use prebuilds
- '!node_modules/pdfjs-dist/web/**/*'
- '!node_modules/pdfjs-dist/legacy/web/*'
- '!node_modules/pdfjs-dist/legacy/**/*'
- '!node_modules/selection-hook/node_modules' # we don't need what in the node_modules dir
- '!node_modules/selection-hook/src' # we don't need source files
- '!**/*.{h,iobj,ipdb,tlog,recipe,vcxproj,vcxproj.filters,Makefile,*.Makefile}' # filter .node build files

View File

@ -67,18 +67,12 @@
"prepare": "git config blame.ignoreRevsFile .git-blame-ignore-revs && husky"
},
"dependencies": {
"@aws-sdk/client-s3": "^3.840.0",
"@cherrystudio/pdf-to-img-napi": "^0.0.1",
"@libsql/client": "0.14.0",
"@libsql/win32-x64-msvc": "^0.4.7",
"@strongtz/win32-arm64-msvc": "^0.4.7",
"iconv-lite": "^0.6.3",
"jaison": "^2.0.2",
"jschardet": "^3.1.4",
"jsdom": "26.1.0",
"macos-release": "^3.4.0",
"node-stream-zip": "^1.15.0",
"notion-helper": "^1.3.22",
"os-proxy-config": "^1.1.2",
"pdfjs-dist": "4.10.38",
"react-json-view": "^1.21.3",
@ -91,6 +85,7 @@
"@agentic/tavily": "^7.3.3",
"@ant-design/v5-patch-for-react-19": "^1.0.3",
"@anthropic-ai/sdk": "^0.41.0",
"@aws-sdk/client-s3": "^3.840.0",
"@cherrystudio/embedjs": "^0.1.31",
"@cherrystudio/embedjs-libsql": "^0.1.31",
"@cherrystudio/embedjs-loader-csv": "^0.1.31",
@ -199,15 +194,20 @@
"html-to-image": "^1.11.13",
"husky": "^9.1.7",
"i18next": "^23.11.5",
"iconv-lite": "^0.6.3",
"jaison": "^2.0.2",
"jest-styled-components": "^7.2.0",
"jschardet": "^3.1.4",
"lint-staged": "^15.5.0",
"lodash": "^4.17.21",
"lru-cache": "^11.1.0",
"lucide-react": "^0.487.0",
"macos-release": "^3.4.0",
"markdown-it": "^14.1.0",
"mermaid": "^11.7.0",
"mime": "^4.0.4",
"motion": "^12.10.5",
"notion-helper": "^1.3.22",
"npx-scope-finder": "^1.2.0",
"officeparser": "^4.2.0",
"openai": "patch:openai@npm%3A5.1.0#~/.yarn/patches/openai-npm-5.1.0-0e7b3ccb07.patch",

View File

@ -5,6 +5,7 @@ import { windowService } from '@main/services/WindowService'
import { getFileExt } from '@main/utils/file'
import { FileMetadata, OcrProvider } from '@types'
import { app } from 'electron'
import pdfjs from 'pdfjs-dist'
import { TypedArray } from 'pdfjs-dist/types/src/display/api'
export default abstract class BaseOcrProvider {
@ -76,8 +77,7 @@ export default abstract class BaseOcrProvider {
source: string | URL | TypedArray,
passwordCallback?: (fn: (password: string) => void, reason: string) => string
) {
const { getDocument } = await import('pdfjs-dist/legacy/build/pdf.mjs')
const documentLoadingTask = getDocument(source)
const documentLoadingTask = pdfjs.getDocument(source)
if (passwordCallback) {
documentLoadingTask.onPassword = passwordCallback
}

View File

@ -5,6 +5,7 @@ import { windowService } from '@main/services/WindowService'
import { getFileExt } from '@main/utils/file'
import { FileMetadata, PreprocessProvider } from '@types'
import { app } from 'electron'
import pdfjs from 'pdfjs-dist'
import { TypedArray } from 'pdfjs-dist/types/src/display/api'
export default abstract class BasePreprocessProvider {
@ -80,8 +81,7 @@ export default abstract class BasePreprocessProvider {
source: string | URL | TypedArray,
passwordCallback?: (fn: (password: string) => void, reason: string) => string
) {
const { getDocument } = await import('pdfjs-dist/legacy/build/pdf.mjs')
const documentLoadingTask = getDocument(source)
const documentLoadingTask = pdfjs.getDocument(source)
if (passwordCallback) {
documentLoadingTask.onPassword = passwordCallback
}

View File

@ -15,8 +15,8 @@ import * as fs from 'fs'
import { writeFileSync } from 'fs'
import { readFile } from 'fs/promises'
import officeParser from 'officeparser'
import { getDocument } from 'officeparser/pdfjs-dist-build/pdf.js'
import * as path from 'path'
import pdfjs from 'pdfjs-dist'
import { chdir } from 'process'
import { v4 as uuidv4 } from 'uuid'
import WordExtractor from 'word-extractor'
@ -367,7 +367,7 @@ class FileStorage {
const filePath = path.join(this.storageDir, id)
const buffer = await fs.promises.readFile(filePath)
const doc = await getDocument({ data: buffer }).promise
const doc = await pdfjs.getDocument({ data: buffer }).promise
const pages = doc.numPages
await doc.destroy()
return pages