diff --git a/package.json b/package.json
index 06e656cbc1..7c193730d0 100644
--- a/package.json
+++ b/package.json
@@ -63,6 +63,8 @@
     "@libsql/client": "0.14.0",
     "@libsql/win32-x64-msvc": "^0.4.7",
     "@strongtz/win32-arm64-msvc": "^0.4.7",
+    "iconv-lite": "^0.6.3",
+    "jschardet": "^3.1.4",
     "jsdom": "26.1.0",
     "macos-release": "^3.4.0",
     "node-stream-zip": "^1.15.0",
diff --git a/src/main/knowledage/loader/index.ts b/src/main/knowledage/loader/index.ts
index 783e62881a..f86df65dba 100644
--- a/src/main/knowledage/loader/index.ts
+++ b/src/main/knowledage/loader/index.ts
@@ -1,8 +1,7 @@
-import * as fs from 'node:fs'
-
 import { JsonLoader, LocalPathLoader, RAGApplication, TextLoader } from '@cherrystudio/embedjs'
 import type { AddLoaderReturn } from '@cherrystudio/embedjs-interfaces'
 import { WebLoader } from '@cherrystudio/embedjs-loader-web'
+import { readTextFileWithAutoEncoding } from '@main/utils/file'
 import { LoaderReturn } from '@shared/config/types'
 import { FileMetadata, KnowledgeBaseParams } from '@types'
 import Logger from 'electron-log'
@@ -115,7 +114,7 @@ export async function addFileLoader(
       // HTML类型处理
       loaderReturn = await ragApplication.addLoader(
         new WebLoader({
-          urlOrContent: fs.readFileSync(file.path, 'utf-8'),
+          urlOrContent: readTextFileWithAutoEncoding(file.path),
           chunkSize: base.chunkSize,
           chunkOverlap: base.chunkOverlap
         }) as any,
@@ -125,7 +124,7 @@ export async function addFileLoader(
 
     case 'json':
       try {
-        jsonObject = JSON.parse(fs.readFileSync(file.path, 'utf-8'))
+        jsonObject = JSON.parse(readTextFileWithAutoEncoding(file.path))
       } catch (error) {
         jsonParsed = false
         Logger.warn('[KnowledgeBase] failed parsing json file, falling back to text processing:', file.path, error)
@@ -141,7 +140,7 @@ export async function addFileLoader(
       // 如果是其他文本类型且尚未读取文件,则读取文件
       loaderReturn = await ragApplication.addLoader(
         new TextLoader({
-          text: fs.readFileSync(file.path, 'utf-8'),
+          text: readTextFileWithAutoEncoding(file.path),
           chunkSize: base.chunkSize,
           chunkOverlap: base.chunkOverlap
         }) as any,
diff --git a/src/main/services/FileStorage.ts b/src/main/services/FileStorage.ts
index 0bdcdf56f5..baa94f535a 100644
--- a/src/main/services/FileStorage.ts
+++ b/src/main/services/FileStorage.ts
@@ -1,4 +1,4 @@
-import { getFilesDir, getFileType, getTempDir } from '@main/utils/file'
+import { getFilesDir, getFileType, getTempDir, readTextFileWithAutoEncoding } from '@main/utils/file'
 import { documentExts, imageExts, MB } from '@shared/config/constant'
 import { FileMetadata } from '@types'
 import * as crypto from 'crypto'
@@ -188,6 +188,8 @@ class FileStorage {
       count: 1
     }
 
+    logger.info('[FileStorage] File uploaded:', fileMetadata)
+
     return fileMetadata
   }
 
@@ -256,7 +258,13 @@ class FileStorage {
       }
     }
 
-    return fs.readFileSync(filePath, 'utf8')
+    try {
+      const result = readTextFileWithAutoEncoding(filePath)
+      return result
+    } catch (error) {
+      logger.error(error)
+      return 'failed to read file'
+    }
   }
 
   public createTempFile = async (_: Electron.IpcMainInvokeEvent, fileName: string): Promise => {
diff --git a/src/main/utils/__tests__/file.test.ts b/src/main/utils/__tests__/file.test.ts
index 14f4801524..6066729dc7 100644
--- a/src/main/utils/__tests__/file.test.ts
+++ b/src/main/utils/__tests__/file.test.ts
@@ -3,8 +3,10 @@ import os from 'node:os'
 import path from 'node:path'
 
 import { FileTypes } from '@types'
+import iconv from 'iconv-lite'
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
 
+import { detectEncoding, readTextFileWithAutoEncoding } from '../file'
 import { getAllFiles, getAppConfigDir, getConfigDir, getFilesDir, getFileType, getTempDir } from '../file'
 
 // Mock dependencies
@@ -241,4 +243,104 @@ describe('file', () => {
       expect(appConfigDir).toBe('/mock/home/.cherrystudio/config/')
     })
   })
+
+  // Encoding detection and auto-decoding tests, nested inside describe('file')
+  describe('detectEncoding', () => {
+    const mockFilePath = '/path/to/mock/file.txt'
+
+    beforeEach(() => {
+      vi.mocked(fs.openSync).mockReturnValue(123)
+      vi.mocked(fs.closeSync).mockImplementation(() => {})
+    })
+
+    it('should correctly detect UTF-8 encoding', () => {
+      // Prepare a UTF-8 encoded Buffer
+      const content = '这是UTF-8测试内容'
+      const buffer = Buffer.from(content, 'utf-8')
+
+      // Mock the file read
+      vi.mocked(fs.readSync).mockImplementation((_, buf) => {
+        const targetBuffer = new Uint8Array(buf.buffer)
+        const sourceBuffer = new Uint8Array(buffer)
+        targetBuffer.set(sourceBuffer)
+        return 1024
+      })
+
+      const encoding = detectEncoding(mockFilePath)
+      expect(encoding).toBe('UTF-8')
+    })
+
+    it('should correctly detect GB2312 encoding', () => {
+      // Build GB2312-encoded content with iconv
+      const content = '这是一段GB2312编码的测试内容'
+      const gb2312Buffer = iconv.encode(content, 'GB2312')
+
+      // Mock the file read
+      vi.mocked(fs.readSync).mockImplementation((_, buf) => {
+        const targetBuffer = new Uint8Array(buf.buffer)
+        const sourceBuffer = new Uint8Array(gb2312Buffer)
+        targetBuffer.set(sourceBuffer)
+        return gb2312Buffer.length
+      })
+
+      const encoding = detectEncoding(mockFilePath)
+      expect(encoding).toMatch(/GB2312|GB18030/i)
+    })
+
+    it('should correctly detect ASCII encoding', () => {
+      // Prepare ASCII-encoded content
+      const content = 'ASCII content'
+      const buffer = Buffer.from(content, 'ascii')
+
+      // Mock the file read
+      vi.mocked(fs.readSync).mockImplementation((_, buf) => {
+        const targetBuffer = new Uint8Array(buf.buffer)
+        const sourceBuffer = new Uint8Array(buffer)
+        targetBuffer.set(sourceBuffer)
+        return buffer.length
+      })
+
+      const encoding = detectEncoding(mockFilePath)
+      expect(encoding.toLowerCase()).toBe('ascii')
+    })
+  })
+
+  describe('readTextFileWithAutoEncoding', () => {
+    const mockFilePath = '/path/to/mock/file.txt'
+
+    beforeEach(() => {
+      vi.mocked(fs.openSync).mockReturnValue(123)
+      vi.mocked(fs.closeSync).mockImplementation(() => {})
+    })
+
+    it('should read file with auto encoding', () => {
+      const content = '这是一段GB2312编码的测试内容'
+      const buffer = iconv.encode(content, 'GB2312')
+      vi.mocked(fs.readSync).mockImplementation((_, buf) => {
+        const targetBuffer = new Uint8Array(buf.buffer)
+        const sourceBuffer = new Uint8Array(buffer)
+        targetBuffer.set(sourceBuffer)
+        return buffer.length
+      })
+      vi.mocked(fs.readFileSync).mockReturnValue(buffer)
+
+      const result = readTextFileWithAutoEncoding(mockFilePath)
+      expect(result).toBe(content)
+    })
+
+    it('should try to fix bad detected encoding', () => {
+      const content = '这是一段GB2312编码的测试内容'
+      const buffer = iconv.encode(content, 'GB2312')
+      // Give the detector a pure-ASCII sample so its first guess cannot decode the GB2312 file
+      const misleadingSample = Buffer.from('ASCII content', 'ascii')
+      vi.mocked(fs.readSync).mockImplementation((_, buf) => {
+        const targetBuffer = new Uint8Array(buf.buffer)
+        targetBuffer.set(new Uint8Array(misleadingSample))
+        return misleadingSample.length
+      })
+      vi.mocked(fs.readFileSync).mockReturnValue(buffer)
+      const result = readTextFileWithAutoEncoding(mockFilePath)
+      expect(result).toBe(content)
+    })
+  })
 })
diff --git a/src/main/utils/file.ts b/src/main/utils/file.ts
index 2c52e82a71..baba7ec8ba 100644
--- a/src/main/utils/file.ts
+++ b/src/main/utils/file.ts
@@ -6,6 +6,9 @@ import { isLinux, isPortable } from '@main/constant'
 import { audioExts, documentExts, imageExts, textExts, videoExts } from '@shared/config/constant'
 import { FileMetadata, FileTypes } from '@types'
 import { app } from 'electron'
+import Logger from 'electron-log'
+import iconv from 'iconv-lite'
+import { detect as detectEncoding_, detectAll as detectEncodingAll } from 'jschardet'
 import { v4 as uuidv4 } from 'uuid'
 
 export function initAppDataDir() {
@@ -202,3 +205,62 @@ export function getCacheDir() {
 export function getAppConfigDir(name: string) {
   return path.join(getConfigDir(), name)
 }
+
+/**
+ * Detects a file's text encoding with jschardet, sampling only the first 1 KB.
+ * @param filePath - Path of the file to inspect
+ * @returns The encoding name reported by jschardet (e.g. UTF-8, ascii, GB2312), or UTF-8 when it reports none
+ */
+export function detectEncoding(filePath: string): string {
+  const sample = Buffer.alloc(1024)
+  const fd = fs.openSync(filePath, 'r')
+  try {
+    const bytesRead = fs.readSync(fd, sample, 0, sample.length, 0)
+    // Files shorter than the sample leave zero padding behind; keep it away from the detector
+    const { encoding } = detectEncoding_(sample.subarray(0, bytesRead))
+    return encoding ?? 'UTF-8'
+  } finally {
+    // Release the descriptor even when reading or detection throws
+    fs.closeSync(fd)
+  }
+}
+
+/**
+ * Reads a text file and decodes it with an automatically detected encoding.
+ *
+ * The encoding guessed from the first 1 KB is tried first. If that decode contains U+FFFD
+ * replacement characters (and the guess was not UTF-8), every other candidate jschardet reports
+ * for the whole file is tried in turn; UTF-8 is the last resort.
+ * @param filePath - Path of the file to read
+ * @returns The decoded content of the entire file
+ */
+export function readTextFileWithAutoEncoding(filePath: string) {
+  const encoding = detectEncoding(filePath)
+  const data = fs.readFileSync(filePath)
+
+  // jschardet may name an encoding iconv-lite has no codec for; treat that as a failed decode
+  if (iconv.encodingExists(encoding)) {
+    const content = iconv.decode(data, encoding)
+    if (encoding === 'UTF-8' || !content.includes('\uFFFD')) {
+      return content
+    }
+  }
+
+  Logger.error(`文件 ${filePath} 自动识别编码为 ${encoding},但包含错误字符。尝试其他编码`)
+  // Re-detect on the full contents, since the 1 KB sample was not representative, and always
+  // decode `data` rather than a sample so the returned text is never truncated
+  for (const item of detectEncodingAll(data)) {
+    if (item.encoding === encoding || !iconv.encodingExists(item.encoding)) {
+      continue
+    }
+    Logger.log(`尝试使用 ${item.encoding} 解码文件 ${filePath}`)
+    const decoded = iconv.decode(data, item.encoding)
+    if (!decoded.includes('\uFFFD')) {
+      Logger.log(`文件 ${filePath} 解码成功,编码为 ${item.encoding}`)
+      return decoded
+    }
+    Logger.error(`文件 ${filePath} 使用 ${item.encoding} 解码失败,尝试下一个编码`)
+  }
+  Logger.error(`文件 ${filePath} 所有可能的编码均解码失败,尝试使用 UTF-8 解码`)
+  return iconv.decode(data, 'UTF-8')
+}
diff --git a/yarn.lock b/yarn.lock
index 7afa0defc2..4ff546f911 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -7165,7 +7165,9 @@ __metadata:
     html-to-image: "npm:^1.11.13"
     husky: "npm:^9.1.7"
     i18next: "npm:^23.11.5"
+    iconv-lite: "npm:^0.6.3"
     jest-styled-components: "npm:^7.2.0"
+    jschardet: "npm:^3.1.4"
     jsdom: "npm:26.1.0"
     lint-staged: "npm:^15.5.0"
     lodash: "npm:^4.17.21"
@@ -13052,6 +13054,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"jschardet@npm:^3.1.4":
+  version: 3.1.4
+  resolution: "jschardet@npm:3.1.4"
+  checksum: 10c0/d72c724ff60bc185d3962617ffda6849c6d632a935820841078c656a5247d73617a5df3b233e1fb1064de8683f7dae1b422b68186d1d6db22117b59edb5433dc
+  languageName: node
+  linkType: hard
+
 "jsdom@npm:26.1.0":
   version: 26.1.0
   resolution: "jsdom@npm:26.1.0"