feat(FileStorage): add support for .doc files using word-extractor (#7374)

* feat(FileStorage): add support for .doc files and integrate word-extractor

* chore(package): add word-extractor to devDependencies
This commit is contained in:
Tristan Zhang 2025-06-23 08:55:03 +08:00 committed by GitHub
parent 2350919f36
commit a8e23966fa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 47 additions and 2 deletions

View File

@ -124,6 +124,7 @@
"@types/react-infinite-scroll-component": "^5.0.0",
"@types/react-window": "^1",
"@types/tinycolor2": "^1",
"@types/word-extractor": "^1",
"@uiw/codemirror-extensions-langs": "^4.23.12",
"@uiw/codemirror-themes-all": "^4.23.12",
"@uiw/react-codemirror": "^4.23.12",
@ -218,6 +219,7 @@
"vite": "6.2.6",
"vitest": "^3.1.4",
"webdav": "^5.8.0",
"word-extractor": "^1.0.4",
"zipread": "^1.3.3"
},
"resolutions": {

View File

@ -1,7 +1,7 @@
// File-extension lists grouped by category. Every entry is lowercase and
// includes the leading dot.

// Image file extensions.
export const imageExts = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']
// Video file extensions.
export const videoExts = ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.mkv']
// Audio file extensions.
export const audioExts = ['.mp3', '.wav', '.ogg', '.flac', '.aac']
// Document file extensions.
// NOTE(review): `documentExts` is declared twice below. This looks like a diff
// hunk whose +/- markers were lost: the first line is presumably the
// pre-change version and the second (which adds '.doc') the post-change
// version. Only one declaration can exist in the real file — confirm.
export const documentExts = ['.pdf', '.docx', '.pptx', '.xlsx', '.odt', '.odp', '.ods']
export const documentExts = ['.pdf', '.doc', '.docx', '.pptx', '.xlsx', '.odt', '.odp', '.ods']
// Extensions attributed to third-party applications (currently one entry).
export const thirdPartyApplicationExts = ['.draftsExport']
// E-book file extensions.
export const bookExts = ['.epub']
const textExtsByCategory = new Map([

View File

@ -16,6 +16,7 @@ const FILE_LOADER_MAP: Record<string, string> = {
// 内置类型
'.pdf': 'common',
'.csv': 'common',
'.doc': 'common',
'.docx': 'common',
'.pptx': 'common',
'.xlsx': 'common',

View File

@ -220,10 +220,21 @@ class FileStorage {
public readFile = async (_: Electron.IpcMainInvokeEvent, id: string): Promise<string> => {
const filePath = path.join(this.storageDir, id)
if (documentExts.includes(path.extname(filePath))) {
const fileExtension = path.extname(filePath)
if (documentExts.includes(fileExtension)) {
const originalCwd = process.cwd()
try {
chdir(this.tempDir)
if (fileExtension === '.doc') {
const WordExtractor = require('word-extractor')
const extractor = new WordExtractor()
const extracted = await extractor.extract(filePath)
chdir(originalCwd)
return extracted.getBody()
}
const data = await officeParser.parseOfficeAsync(filePath)
chdir(originalCwd)
return data

View File

@ -92,6 +92,7 @@ describe('file', () => {
it('should return DOCUMENT for document extensions', () => {
expect(getFileType('.pdf')).toBe(FileTypes.DOCUMENT)
expect(getFileType('.pptx')).toBe(FileTypes.DOCUMENT)
expect(getFileType('.doc')).toBe(FileTypes.DOCUMENT)
expect(getFileType('.docx')).toBe(FileTypes.DOCUMENT)
expect(getFileType('.xlsx')).toBe(FileTypes.DOCUMENT)
expect(getFileType('.odt')).toBe(FileTypes.DOCUMENT)

View File

@ -4754,6 +4754,15 @@ __metadata:
languageName: node
linkType: hard
"@types/word-extractor@npm:^1":
version: 1.0.6
resolution: "@types/word-extractor@npm:1.0.6"
dependencies:
"@types/node": "npm:*"
checksum: 10c0/84f89c458213db5aec4d6badad14e0f2c07ac4b92f16165d19a95548f2b98fd5fff00419d49547464cb75c9432b5e9cb3b452d75eb5f07d808e31b44be390453
languageName: node
linkType: hard
"@types/ws@npm:^8.5.4":
version: 8.18.1
resolution: "@types/ws@npm:8.18.1"
@ -5642,6 +5651,7 @@ __metadata:
"@types/react-infinite-scroll-component": "npm:^5.0.0"
"@types/react-window": "npm:^1"
"@types/tinycolor2": "npm:^1"
"@types/word-extractor": "npm:^1"
"@uiw/codemirror-extensions-langs": "npm:^4.23.12"
"@uiw/codemirror-themes-all": "npm:^4.23.12"
"@uiw/react-codemirror": "npm:^4.23.12"
@ -5742,6 +5752,7 @@ __metadata:
vite: "npm:6.2.6"
vitest: "npm:^3.1.4"
webdav: "npm:^5.8.0"
word-extractor: "npm:^1.0.4"
zipread: "npm:^1.3.3"
languageName: unknown
linkType: soft
@ -16428,6 +16439,15 @@ __metadata:
languageName: node
linkType: hard
"saxes@npm:^5.0.1":
version: 5.0.1
resolution: "saxes@npm:5.0.1"
dependencies:
xmlchars: "npm:^2.2.0"
checksum: 10c0/b7476c41dbe1c3a89907d2546fecfba234de5e66743ef914cde2603f47b19bed09732ab51b528ad0f98b958369d8be72b6f5af5c9cfad69972a73d061f0b3952
languageName: node
linkType: hard
"saxes@npm:^6.0.0":
version: 6.0.0
resolution: "saxes@npm:6.0.0"
@ -18632,6 +18652,16 @@ __metadata:
languageName: node
linkType: hard
"word-extractor@npm:^1.0.4":
version: 1.0.4
resolution: "word-extractor@npm:1.0.4"
dependencies:
saxes: "npm:^5.0.1"
yauzl: "npm:^2.10.0"
checksum: 10c0/f8c6b4f9278802d0c803479c1441713e351e67f7b0d2f85bd8cbe94b76298d4adb058b5f23ee0a01faa02f3b1f01c507a4a2f44fa39cfcbd498a51769dd9e8e7
languageName: node
linkType: hard
"word-wrap@npm:^1.2.5":
version: 1.2.5
resolution: "word-wrap@npm:1.2.5"