From a7b78c547ae626eb11e56d2131f1690b0ab59bb1 Mon Sep 17 00:00:00 2001 From: Phantom <59059173+EurFelux@users.noreply.github.com> Date: Thu, 10 Jul 2025 22:13:40 +0800 Subject: [PATCH] fix(encoding): encoding detection and decoding logic (#8024) --- src/main/knowledage/loader/index.ts | 6 +- src/main/services/FileStorage.ts | 13 +- src/main/utils/__tests__/file.test.ts | 125 ++++++------------ src/main/utils/file.ts | 85 ++++++------ src/preload/index.ts | 3 +- .../src/aiCore/clients/BaseApiClient.ts | 2 +- .../clients/anthropic/AnthropicAPIClient.ts | 2 +- .../aiCore/clients/gemini/GeminiAPIClient.ts | 2 +- .../aiCore/clients/openai/OpenAIApiClient.ts | 2 +- .../clients/openai/OpenAIResponseAPIClient.ts | 2 +- src/renderer/src/services/TokenService.ts | 2 +- 11 files changed, 99 insertions(+), 145 deletions(-) diff --git a/src/main/knowledage/loader/index.ts b/src/main/knowledage/loader/index.ts index f86df65dba..5fba26436e 100644 --- a/src/main/knowledage/loader/index.ts +++ b/src/main/knowledage/loader/index.ts @@ -114,7 +114,7 @@ export async function addFileLoader( // HTML类型处理 loaderReturn = await ragApplication.addLoader( new WebLoader({ - urlOrContent: readTextFileWithAutoEncoding(file.path), + urlOrContent: await readTextFileWithAutoEncoding(file.path), chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any, @@ -124,7 +124,7 @@ export async function addFileLoader( case 'json': try { - jsonObject = JSON.parse(readTextFileWithAutoEncoding(file.path)) + jsonObject = JSON.parse(await readTextFileWithAutoEncoding(file.path)) } catch (error) { jsonParsed = false Logger.warn('[KnowledgeBase] failed parsing json file, falling back to text processing:', file.path, error) @@ -140,7 +140,7 @@ export async function addFileLoader( // 如果是其他文本类型且尚未读取文件,则读取文件 loaderReturn = await ragApplication.addLoader( new TextLoader({ - text: readTextFileWithAutoEncoding(file.path), + text: await readTextFileWithAutoEncoding(file.path), chunkSize: base.chunkSize, chunkOverlap: base.chunkOverlap }) as any, diff --git a/src/main/services/FileStorage.ts b/src/main/services/FileStorage.ts index 87d465e5f4..b7a80c7f0b 100644 --- a/src/main/services/FileStorage.ts +++ b/src/main/services/FileStorage.ts @@ -231,7 +231,11 @@ class FileStorage { await fs.promises.rm(path.join(this.storageDir, id), { recursive: true }) } - public readFile = async (_: Electron.IpcMainInvokeEvent, id: string): Promise => { + public readFile = async ( + _: Electron.IpcMainInvokeEvent, + id: string, + detectEncoding: boolean = false + ): Promise => { const filePath = path.join(this.storageDir, id) const fileExtension = path.extname(filePath) @@ -259,8 +263,11 @@ class FileStorage { } try { - const result = readTextFileWithAutoEncoding(filePath) - return result + if (detectEncoding) { + return readTextFileWithAutoEncoding(filePath) + } else { + return fs.readFileSync(filePath, 'utf-8') + } } catch (error) { logger.error(error) return 'failed to read file' diff --git a/src/main/utils/__tests__/file.test.ts b/src/main/utils/__tests__/file.test.ts index 6066729dc7..fbd734fd3d 100644 --- a/src/main/utils/__tests__/file.test.ts +++ b/src/main/utils/__tests__/file.test.ts @@ -1,16 +1,19 @@ import * as fs from 'node:fs' +import * as fsPromises from 'node:fs/promises' import os from 'node:os' import path from 'node:path' import { FileTypes } from '@types' import iconv from 'iconv-lite' +import { detectAll as detectEncodingAll } from 'jschardet' import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' -import { detectEncoding, readTextFileWithAutoEncoding } from '../file' +import { readTextFileWithAutoEncoding } from '../file' import { getAllFiles, getAppConfigDir, getConfigDir, getFilesDir, getFileType, getTempDir } from '../file' // Mock dependencies vi.mock('node:fs') +vi.mock('node:fs/promises') vi.mock('node:os') vi.mock('node:path') vi.mock('uuid', () => ({ @@ -244,102 +247,52 @@ describe('file', () => { }) }) - // 在 describe('file') 块内部添加新的 describe 块 - describe('detectEncoding', () => { - const mockFilePath = '/path/to/mock/file.txt' - - beforeEach(() => { - vi.mocked(fs.openSync).mockReturnValue(123) - vi.mocked(fs.closeSync).mockImplementation(() => {}) - }) - - it('should correctly detect UTF-8 encoding', () => { - // 准备UTF-8编码的Buffer - const content = '这是UTF-8测试内容' - const buffer = Buffer.from(content, 'utf-8') - - // 模拟文件读取 - vi.mocked(fs.readSync).mockImplementation((_, buf) => { - const targetBuffer = new Uint8Array(buf.buffer) - const sourceBuffer = new Uint8Array(buffer) - targetBuffer.set(sourceBuffer) - return 1024 - }) - - const encoding = detectEncoding(mockFilePath) - expect(encoding).toBe('UTF-8') - }) - - it('should correctly detect GB2312 encoding', () => { - // 使用iconv创建GB2312编码内容 - const content = '这是一段GB2312编码的测试内容' - const gb2312Buffer = iconv.encode(content, 'GB2312') - - // 模拟文件读取 - vi.mocked(fs.readSync).mockImplementation((_, buf) => { - const targetBuffer = new Uint8Array(buf.buffer) - const sourceBuffer = new Uint8Array(gb2312Buffer) - targetBuffer.set(sourceBuffer) - return gb2312Buffer.length - }) - - const encoding = detectEncoding(mockFilePath) - expect(encoding).toMatch(/GB2312|GB18030/i) - }) - - it('should correctly detect ASCII encoding', () => { - // 准备ASCII编码内容 - const content = 'ASCII content' - const buffer = Buffer.from(content, 'ascii') - - // 模拟文件读取 - vi.mocked(fs.readSync).mockImplementation((_, buf) => { - const targetBuffer = new Uint8Array(buf.buffer) - const sourceBuffer = new Uint8Array(buffer) - targetBuffer.set(sourceBuffer) - return buffer.length - }) - - const encoding = detectEncoding(mockFilePath) - expect(encoding.toLowerCase()).toBe('ascii') - }) - }) - describe('readTextFileWithAutoEncoding', () => { const mockFilePath = '/path/to/mock/file.txt' - beforeEach(() => { - vi.mocked(fs.openSync).mockReturnValue(123) - vi.mocked(fs.closeSync).mockImplementation(() => {}) - }) - - it('should read file with auto encoding', () => { + it('should read file with auto encoding', async () => { const content = '这是一段GB2312编码的测试内容' const buffer = iconv.encode(content, 'GB2312') - vi.mocked(fs.readSync).mockImplementation((_, buf) => { - const targetBuffer = new Uint8Array(buf.buffer) - const sourceBuffer = new Uint8Array(buffer) - targetBuffer.set(sourceBuffer) - return buffer.length - }) - vi.mocked(fs.readFileSync).mockReturnValue(buffer) - const result = readTextFileWithAutoEncoding(mockFilePath) + // 创建模拟的 FileHandle 对象 + const mockFileHandle = { + read: vi.fn().mockResolvedValue({ + bytesRead: buffer.byteLength, + buffer: buffer + }), + close: vi.fn().mockResolvedValue(undefined) + } + + // 模拟 open 方法 + vi.spyOn(fsPromises, 'open').mockResolvedValue(mockFileHandle as any) + vi.spyOn(fsPromises, 'readFile').mockResolvedValue(buffer) + + const result = await readTextFileWithAutoEncoding(mockFilePath) expect(result).toBe(content) }) - it('should try to fix bad detected encoding', () => { + it('should try to fix bad detected encoding', async () => { const content = '这是一段GB2312编码的测试内容' const buffer = iconv.encode(content, 'GB2312') - vi.mocked(fs.readSync).mockImplementation((_, buf) => { - const targetBuffer = new Uint8Array(buf.buffer) - const sourceBuffer = new Uint8Array(buffer) - targetBuffer.set(sourceBuffer) - return buffer.length - }) - vi.mocked(fs.readFileSync).mockReturnValue(buffer) - vi.mocked(vi.fn(detectEncoding)).mockReturnValue('UTF-8') - const result = readTextFileWithAutoEncoding(mockFilePath) + + // 创建模拟的 FileHandle 对象 + const mockFileHandle = { + read: vi.fn().mockResolvedValue({ + bytesRead: buffer.byteLength, + buffer: buffer + }), + close: vi.fn().mockResolvedValue(undefined) + } + + // 模拟 fs.open 方法 + vi.spyOn(fsPromises, 'open').mockResolvedValue(mockFileHandle as any) + vi.spyOn(fsPromises, 'readFile').mockResolvedValue(buffer) + vi.mocked(vi.fn(detectEncodingAll)).mockReturnValue([ + { encoding: 'UTF-8', confidence: 0.9 }, + { encoding: 'GB2312', confidence: 0.8 } + ]) + + const result = await readTextFileWithAutoEncoding(mockFilePath) expect(result).toBe(content) }) }) diff --git a/src/main/utils/file.ts b/src/main/utils/file.ts index baba7ec8ba..4cf8bd0e44 100644 --- a/src/main/utils/file.ts +++ b/src/main/utils/file.ts @@ -1,14 +1,15 @@ import * as fs from 'node:fs' +import { open, readFile } from 'node:fs/promises' import os from 'node:os' import path from 'node:path' import { isLinux, isPortable } from '@main/constant' -import { audioExts, documentExts, imageExts, textExts, videoExts } from '@shared/config/constant' +import { audioExts, documentExts, imageExts, MB, textExts, videoExts } from '@shared/config/constant' import { FileMetadata, FileTypes } from '@types' import { app } from 'electron' import Logger from 'electron-log' import iconv from 'iconv-lite' -import { detect as detectEncoding_, detectAll as detectEncodingAll } from 'jschardet' +import * as jschardet from 'jschardet' import { v4 as uuidv4 } from 'uuid' export function initAppDataDir() { @@ -206,56 +207,48 @@ export function getAppConfigDir(name: string) { return path.join(getConfigDir(), name) } -/** - * 使用 jschardet 库检测文件编码格式 - * @param filePath - 文件路径 - * @returns 返回文件的编码格式,如 UTF-8, ascii, GB2312 等 - */ -export function detectEncoding(filePath: string): string { - // 读取文件前1KB来检测编码 - const buffer = Buffer.alloc(1024) - const fd = fs.openSync(filePath, 'r') - fs.readSync(fd, buffer, 0, 1024, 0) - fs.closeSync(fd) - const { encoding } = detectEncoding_(buffer) - return encoding -} - /** * 读取文件内容并自动检测编码格式进行解码 * @param filePath - 文件路径 * @returns 解码后的文件内容 */ -export function readTextFileWithAutoEncoding(filePath: string) { - const encoding = detectEncoding(filePath) - const data = fs.readFileSync(filePath) - const content = iconv.decode(data, encoding) +export async function readTextFileWithAutoEncoding(filePath: string): Promise { + // 读取前1MB以检测编码 + const buffer = Buffer.alloc(1 * MB) + const fh = await open(filePath, 'r') + const { buffer: bufferRead } = await fh.read(buffer, 0, 1 * MB, 0) + await fh.close() - if (content.includes('\uFFFD') && encoding !== 'UTF-8') { - Logger.error(`文件 ${filePath} 自动识别编码为 ${encoding},但包含错误字符。尝试其他编码`) - const buffer = Buffer.alloc(1024) - const fd = fs.openSync(filePath, 'r') - fs.readSync(fd, buffer, 0, 1024, 0) - fs.closeSync(fd) - const encodings = detectEncodingAll(buffer) - if (encodings.length > 0) { - for (const item of encodings) { - if (item.encoding === encoding) { - continue - } - Logger.log(`尝试使用 ${item.encoding} 解码文件 ${filePath}`) - const content = iconv.decode(buffer, item.encoding) - if (!content.includes('\uFFFD')) { - Logger.log(`文件 ${filePath} 解码成功,编码为 ${item.encoding}`) - return content - } else { - Logger.error(`文件 ${filePath} 使用 ${item.encoding} 解码失败,尝试下一个编码`) - } - } - } - Logger.error(`文件 ${filePath} 所有可能的编码均解码失败,尝试使用 UTF-8 解码`) - return iconv.decode(buffer, 'UTF-8') + // 获取文件编码格式,最多取前两个可能的编码 + const encodings = jschardet + .detectAll(bufferRead) + .map((item) => ({ + ...item, + encoding: item.encoding === 'ascii' ? 'UTF-8' : item.encoding + })) + .filter((item, index, array) => array.findIndex((prevItem) => prevItem.encoding === item.encoding) === index) + .slice(0, 2) + + if (encodings.length === 0) { + Logger.error('Failed to detect encoding. Use utf-8 to decode.') + const data = await readFile(filePath) + return iconv.decode(data, 'UTF-8') } - return content + const data = await readFile(filePath) + + for (const item of encodings) { + const encoding = item.encoding + const content = iconv.decode(data, encoding) + if (content.includes('\uFFFD')) { + Logger.error( + `File ${filePath} was auto-detected as ${encoding} encoding, but contains invalid characters. Trying other encodings` + ) + } else { + return content + } + } + + Logger.error(`File ${filePath} failed to decode with all possible encodings, trying UTF-8 encoding`) + return iconv.decode(data, 'UTF-8') } diff --git a/src/preload/index.ts b/src/preload/index.ts index 8fc104b68e..ca88949e70 100644 --- a/src/preload/index.ts +++ b/src/preload/index.ts @@ -115,7 +115,8 @@ const api = { upload: (file: FileMetadata) => ipcRenderer.invoke(IpcChannel.File_Upload, file), delete: (fileId: string) => ipcRenderer.invoke(IpcChannel.File_Delete, fileId), deleteDir: (dirPath: string) => ipcRenderer.invoke(IpcChannel.File_DeleteDir, dirPath), - read: (fileId: string) => ipcRenderer.invoke(IpcChannel.File_Read, fileId), + read: (fileId: string, detectEncoding?: boolean) => + ipcRenderer.invoke(IpcChannel.File_Read, fileId, detectEncoding), clear: () => ipcRenderer.invoke(IpcChannel.File_Clear), get: (filePath: string) => ipcRenderer.invoke(IpcChannel.File_Get, filePath), /** diff --git a/src/renderer/src/aiCore/clients/BaseApiClient.ts b/src/renderer/src/aiCore/clients/BaseApiClient.ts index d311ce2d6a..876c4e605f 100644 --- a/src/renderer/src/aiCore/clients/BaseApiClient.ts +++ b/src/renderer/src/aiCore/clients/BaseApiClient.ts @@ -254,7 +254,7 @@ export abstract class BaseApiClient< for (const fileBlock of textFileBlocks) { const file = fileBlock.file - const fileContent = (await window.api.file.read(file.id + file.ext)).trim() + const fileContent = (await window.api.file.read(file.id + file.ext, true)).trim() const fileNameRow = 'file: ' + file.origin_name + '\n\n' text = text + fileNameRow + fileContent + divider } diff --git a/src/renderer/src/aiCore/clients/anthropic/AnthropicAPIClient.ts b/src/renderer/src/aiCore/clients/anthropic/AnthropicAPIClient.ts index 93176a9566..73a5bb61c1 100644 --- a/src/renderer/src/aiCore/clients/anthropic/AnthropicAPIClient.ts +++ b/src/renderer/src/aiCore/clients/anthropic/AnthropicAPIClient.ts @@ -231,7 +231,7 @@ export class AnthropicAPIClient extends BaseApiClient< } }) } else { - const fileContent = await (await window.api.file.read(file.id + file.ext)).trim() + const fileContent = await (await window.api.file.read(file.id + file.ext, true)).trim() parts.push({ type: 'text', text: file.origin_name + '\n' + fileContent diff --git a/src/renderer/src/aiCore/clients/gemini/GeminiAPIClient.ts b/src/renderer/src/aiCore/clients/gemini/GeminiAPIClient.ts index bcf7c0d592..bd87ccc821 100644 --- a/src/renderer/src/aiCore/clients/gemini/GeminiAPIClient.ts +++ b/src/renderer/src/aiCore/clients/gemini/GeminiAPIClient.ts @@ -288,7 +288,7 @@ export class GeminiAPIClient extends BaseApiClient< continue } if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) { - const fileContent = await (await window.api.file.read(file.id + file.ext)).trim() + const fileContent = await (await window.api.file.read(file.id + file.ext, true)).trim() parts.push({ text: file.origin_name + '\n' + fileContent }) diff --git a/src/renderer/src/aiCore/clients/openai/OpenAIApiClient.ts b/src/renderer/src/aiCore/clients/openai/OpenAIApiClient.ts index e3ccc8edd0..b08f179fbc 100644 --- a/src/renderer/src/aiCore/clients/openai/OpenAIApiClient.ts +++ b/src/renderer/src/aiCore/clients/openai/OpenAIApiClient.ts @@ -307,7 +307,7 @@ export class OpenAIAPIClient extends OpenAIBaseClient< } if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) { - const fileContent = await (await window.api.file.read(file.id + file.ext)).trim() + const fileContent = await (await window.api.file.read(file.id + file.ext, true)).trim() parts.push({ type: 'text', text: file.origin_name + '\n' + fileContent diff --git a/src/renderer/src/aiCore/clients/openai/OpenAIResponseAPIClient.ts b/src/renderer/src/aiCore/clients/openai/OpenAIResponseAPIClient.ts index 898e7eec44..6de5f2f876 100644 --- a/src/renderer/src/aiCore/clients/openai/OpenAIResponseAPIClient.ts +++ b/src/renderer/src/aiCore/clients/openai/OpenAIResponseAPIClient.ts @@ -173,7 +173,7 @@ export class OpenAIResponseAPIClient extends OpenAIBaseClient< } if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) { - const fileContent = (await window.api.file.read(file.id + file.ext)).trim() + const fileContent = (await window.api.file.read(file.id + file.ext, true)).trim() parts.push({ type: 'input_text', text: file.origin_name + '\n' + fileContent diff --git a/src/renderer/src/services/TokenService.ts b/src/renderer/src/services/TokenService.ts index ebe4292f1d..e1b6d48b1d 100644 --- a/src/renderer/src/services/TokenService.ts +++ b/src/renderer/src/services/TokenService.ts @@ -19,7 +19,7 @@ async function getFileContent(file: FileMetadata) { } if (file.type === FileTypes.TEXT) { - return await window.api.file.read(file.id + file.ext) + return await window.api.file.read(file.id + file.ext, true) } return ''