From a7b78c547ae626eb11e56d2131f1690b0ab59bb1 Mon Sep 17 00:00:00 2001
From: Phantom <59059173+EurFelux@users.noreply.github.com>
Date: Thu, 10 Jul 2025 22:13:40 +0800
Subject: [PATCH] fix(encoding): encoding detection and decoding logic (#8024)

---
 src/main/knowledage/loader/index.ts           |   6 +-
 src/main/services/FileStorage.ts              |  13 +-
 src/main/utils/__tests__/file.test.ts         | 125 ++++++------------
 src/main/utils/file.ts                        |  85 ++++++------
 src/preload/index.ts                          |   3 +-
 .../src/aiCore/clients/BaseApiClient.ts       |   2 +-
 .../clients/anthropic/AnthropicAPIClient.ts   |   2 +-
 .../aiCore/clients/gemini/GeminiAPIClient.ts  |   2 +-
 .../aiCore/clients/openai/OpenAIApiClient.ts  |   2 +-
 .../clients/openai/OpenAIResponseAPIClient.ts |   2 +-
 src/renderer/src/services/TokenService.ts     |   2 +-
 11 files changed, 99 insertions(+), 145 deletions(-)
diff --git a/src/main/knowledage/loader/index.ts b/src/main/knowledage/loader/index.ts
index f86df65dba..5fba26436e 100644
--- a/src/main/knowledage/loader/index.ts
+++ b/src/main/knowledage/loader/index.ts
@@ -114,7 +114,7 @@ export async function addFileLoader(
       // HTML类型处理
       loaderReturn = await ragApplication.addLoader(
         new WebLoader({
-          urlOrContent: readTextFileWithAutoEncoding(file.path),
+          urlOrContent: await readTextFileWithAutoEncoding(file.path),
           chunkSize: base.chunkSize,
           chunkOverlap: base.chunkOverlap
         }) as any,
@@ -124,7 +124,7 @@ export async function addFileLoader(
 
     case 'json':
       try {
-        jsonObject = JSON.parse(readTextFileWithAutoEncoding(file.path))
+        jsonObject = JSON.parse(await readTextFileWithAutoEncoding(file.path))
       } catch (error) {
         jsonParsed = false
         Logger.warn('[KnowledgeBase] failed parsing json file, falling back to text processing:', file.path, error)
@@ -140,7 +140,7 @@ export async function addFileLoader(
       // 如果是其他文本类型且尚未读取文件，则读取文件
       loaderReturn = await ragApplication.addLoader(
         new TextLoader({
-          text: readTextFileWithAutoEncoding(file.path),
+          text: await readTextFileWithAutoEncoding(file.path),
           chunkSize: base.chunkSize,
           chunkOverlap: base.chunkOverlap
         }) as any,
diff --git a/src/main/services/FileStorage.ts b/src/main/services/FileStorage.ts
index 87d465e5f4..b7a80c7f0b 100644
--- a/src/main/services/FileStorage.ts
+++ b/src/main/services/FileStorage.ts
@@ -231,7 +231,11 @@ class FileStorage {
     await fs.promises.rm(path.join(this.storageDir, id), { recursive: true })
   }
 
-  public readFile = async (_: Electron.IpcMainInvokeEvent, id: string): Promise<string> => {
+  public readFile = async (
+    _: Electron.IpcMainInvokeEvent,
+    id: string,
+    detectEncoding: boolean = false
+  ): Promise<string> => {
     const filePath = path.join(this.storageDir, id)
 
     const fileExtension = path.extname(filePath)
@@ -259,8 +263,11 @@ class FileStorage {
     }
 
     try {
-      const result = readTextFileWithAutoEncoding(filePath)
-      return result
+      if (detectEncoding) {
+        // Await here so a rejected read is handled by the catch below
+        return await readTextFileWithAutoEncoding(filePath)
+      }
+      return fs.readFileSync(filePath, 'utf-8')
     } catch (error) {
       logger.error(error)
       return 'failed to read file'
diff --git a/src/main/utils/__tests__/file.test.ts b/src/main/utils/__tests__/file.test.ts
index 6066729dc7..fbd734fd3d 100644
--- a/src/main/utils/__tests__/file.test.ts
+++ b/src/main/utils/__tests__/file.test.ts
@@ -1,16 +1,19 @@
 import * as fs from 'node:fs'
+import * as fsPromises from 'node:fs/promises'
 import os from 'node:os'
 import path from 'node:path'
 
 import { FileTypes } from '@types'
 import iconv from 'iconv-lite'
+import { detectAll as detectEncodingAll } from 'jschardet'
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
 
-import { detectEncoding, readTextFileWithAutoEncoding } from '../file'
+import { readTextFileWithAutoEncoding } from '../file'
 import { getAllFiles, getAppConfigDir, getConfigDir, getFilesDir, getFileType, getTempDir } from '../file'
 
 // Mock dependencies
 vi.mock('node:fs')
+vi.mock('node:fs/promises')
 vi.mock('node:os')
 vi.mock('node:path')
 vi.mock('uuid', () => ({
@@ -244,102 +247,52 @@ describe('file', () => {
     })
   })
 
-  // 在 describe('file') 块内部添加新的 describe 块
-  describe('detectEncoding', () => {
-    const mockFilePath = '/path/to/mock/file.txt'
-
-    beforeEach(() => {
-      vi.mocked(fs.openSync).mockReturnValue(123)
-      vi.mocked(fs.closeSync).mockImplementation(() => {})
-    })
-
-    it('should correctly detect UTF-8 encoding', () => {
-      // 准备UTF-8编码的Buffer
-      const content = '这是UTF-8测试内容'
-      const buffer = Buffer.from(content, 'utf-8')
-
-      // 模拟文件读取
-      vi.mocked(fs.readSync).mockImplementation((_, buf) => {
-        const targetBuffer = new Uint8Array(buf.buffer)
-        const sourceBuffer = new Uint8Array(buffer)
-        targetBuffer.set(sourceBuffer)
-        return 1024
-      })
-
-      const encoding = detectEncoding(mockFilePath)
-      expect(encoding).toBe('UTF-8')
-    })
-
-    it('should correctly detect GB2312 encoding', () => {
-      // 使用iconv创建GB2312编码内容
-      const content = '这是一段GB2312编码的测试内容'
-      const gb2312Buffer = iconv.encode(content, 'GB2312')
-
-      // 模拟文件读取
-      vi.mocked(fs.readSync).mockImplementation((_, buf) => {
-        const targetBuffer = new Uint8Array(buf.buffer)
-        const sourceBuffer = new Uint8Array(gb2312Buffer)
-        targetBuffer.set(sourceBuffer)
-        return gb2312Buffer.length
-      })
-
-      const encoding = detectEncoding(mockFilePath)
-      expect(encoding).toMatch(/GB2312|GB18030/i)
-    })
-
-    it('should correctly detect ASCII encoding', () => {
-      // 准备ASCII编码内容
-      const content = 'ASCII content'
-      const buffer = Buffer.from(content, 'ascii')
-
-      // 模拟文件读取
-      vi.mocked(fs.readSync).mockImplementation((_, buf) => {
-        const targetBuffer = new Uint8Array(buf.buffer)
-        const sourceBuffer = new Uint8Array(buffer)
-        targetBuffer.set(sourceBuffer)
-        return buffer.length
-      })
-
-      const encoding = detectEncoding(mockFilePath)
-      expect(encoding.toLowerCase()).toBe('ascii')
-    })
-  })
-
   describe('readTextFileWithAutoEncoding', () => {
     const mockFilePath = '/path/to/mock/file.txt'
 
-    beforeEach(() => {
-      vi.mocked(fs.openSync).mockReturnValue(123)
-      vi.mocked(fs.closeSync).mockImplementation(() => {})
-    })
-
-    it('should read file with auto encoding', () => {
+    it('should read file with auto encoding', async () => {
       const content = '这是一段GB2312编码的测试内容'
       const buffer = iconv.encode(content, 'GB2312')
-      vi.mocked(fs.readSync).mockImplementation((_, buf) => {
-        const targetBuffer = new Uint8Array(buf.buffer)
-        const sourceBuffer = new Uint8Array(buffer)
-        targetBuffer.set(sourceBuffer)
-        return buffer.length
-      })
-      vi.mocked(fs.readFileSync).mockReturnValue(buffer)
 
-      const result = readTextFileWithAutoEncoding(mockFilePath)
+      // 创建模拟的 FileHandle 对象
+      const mockFileHandle = {
+        read: vi.fn().mockResolvedValue({
+          bytesRead: buffer.byteLength,
+          buffer: buffer
+        }),
+        close: vi.fn().mockResolvedValue(undefined)
+      }
+
+      // 模拟 open 方法
+      vi.spyOn(fsPromises, 'open').mockResolvedValue(mockFileHandle as any)
+      vi.spyOn(fsPromises, 'readFile').mockResolvedValue(buffer)
+
+      const result = await readTextFileWithAutoEncoding(mockFilePath)
       expect(result).toBe(content)
     })
 
-    it('should try to fix bad detected encoding', () => {
+    it('should try to fix bad detected encoding', async () => {
       const content = '这是一段GB2312编码的测试内容'
       const buffer = iconv.encode(content, 'GB2312')
-      vi.mocked(fs.readSync).mockImplementation((_, buf) => {
-        const targetBuffer = new Uint8Array(buf.buffer)
-        const sourceBuffer = new Uint8Array(buffer)
-        targetBuffer.set(sourceBuffer)
-        return buffer.length
-      })
-      vi.mocked(fs.readFileSync).mockReturnValue(buffer)
-      vi.mocked(vi.fn(detectEncoding)).mockReturnValue('UTF-8')
-      const result = readTextFileWithAutoEncoding(mockFilePath)
+
+      // 创建模拟的 FileHandle 对象
+      const mockFileHandle = {
+        read: vi.fn().mockResolvedValue({
+          bytesRead: buffer.byteLength,
+          buffer: buffer
+        }),
+        close: vi.fn().mockResolvedValue(undefined)
+      }
+
+      // 模拟 fs.open 方法
+      vi.spyOn(fsPromises, 'open').mockResolvedValue(mockFileHandle as any)
+      vi.spyOn(fsPromises, 'readFile').mockResolvedValue(buffer)
+      vi.mocked(vi.fn(detectEncodingAll)).mockReturnValue([
+        { encoding: 'UTF-8', confidence: 0.9 },
+        { encoding: 'GB2312', confidence: 0.8 }
+      ])
+
+      const result = await readTextFileWithAutoEncoding(mockFilePath)
       expect(result).toBe(content)
     })
   })
diff --git a/src/main/utils/file.ts b/src/main/utils/file.ts
index baba7ec8ba..4cf8bd0e44 100644
--- a/src/main/utils/file.ts
+++ b/src/main/utils/file.ts
@@ -1,14 +1,15 @@
 import * as fs from 'node:fs'
+import { open, readFile } from 'node:fs/promises'
 import os from 'node:os'
 import path from 'node:path'
 
 import { isLinux, isPortable } from '@main/constant'
-import { audioExts, documentExts, imageExts, textExts, videoExts } from '@shared/config/constant'
+import { audioExts, documentExts, imageExts, MB, textExts, videoExts } from '@shared/config/constant'
 import { FileMetadata, FileTypes } from '@types'
 import { app } from 'electron'
 import Logger from 'electron-log'
 import iconv from 'iconv-lite'
-import { detect as detectEncoding_, detectAll as detectEncodingAll } from 'jschardet'
+import * as jschardet from 'jschardet'
 import { v4 as uuidv4 } from 'uuid'
 
 export function initAppDataDir() {
@@ -206,56 +207,48 @@ export function getAppConfigDir(name: string) {
   return path.join(getConfigDir(), name)
 }
 
-/**
- * 使用 jschardet 库检测文件编码格式
- * @param filePath - 文件路径
- * @returns 返回文件的编码格式，如 UTF-8, ascii, GB2312 等
- */
-export function detectEncoding(filePath: string): string {
-  // 读取文件前1KB来检测编码
-  const buffer = Buffer.alloc(1024)
-  const fd = fs.openSync(filePath, 'r')
-  fs.readSync(fd, buffer, 0, 1024, 0)
-  fs.closeSync(fd)
-  const { encoding } = detectEncoding_(buffer)
-  return encoding
-}
-
 /**
  * 读取文件内容并自动检测编码格式进行解码
  * @param filePath - 文件路径
  * @returns 解码后的文件内容
  */
-export function readTextFileWithAutoEncoding(filePath: string) {
-  const encoding = detectEncoding(filePath)
-  const data = fs.readFileSync(filePath)
-  const content = iconv.decode(data, encoding)
+export async function readTextFileWithAutoEncoding(filePath: string): Promise<string> {
+  // Detect on the bytes actually read from the first 1MB; unread zero padding would skew the detector
+  const fh = await open(filePath, 'r')
+  let sample: Buffer
+  try {
+    const { buffer, bytesRead } = await fh.read(Buffer.alloc(1 * MB), 0, 1 * MB, 0)
+    sample = buffer.subarray(0, bytesRead)
+  } finally {
+    await fh.close() // always release the handle, even when the read rejects
+  }
+  if (sample.length === 0) return '' // empty file: nothing to detect or decode
 
-  if (content.includes('\uFFFD') && encoding !== 'UTF-8') {
-    Logger.error(`文件 ${filePath} 自动识别编码为 ${encoding}，但包含错误字符。尝试其他编码`)
-    const buffer = Buffer.alloc(1024)
-    const fd = fs.openSync(filePath, 'r')
-    fs.readSync(fd, buffer, 0, 1024, 0)
-    fs.closeSync(fd)
-    const encodings = detectEncodingAll(buffer)
-    if (encodings.length > 0) {
-      for (const item of encodings) {
-        if (item.encoding === encoding) {
-          continue
-        }
-        Logger.log(`尝试使用 ${item.encoding} 解码文件 ${filePath}`)
-        const content = iconv.decode(buffer, item.encoding)
-        if (!content.includes('\uFFFD')) {
-          Logger.log(`文件 ${filePath} 解码成功，编码为 ${item.encoding}`)
-          return content
-        } else {
-          Logger.error(`文件 ${filePath} 使用 ${item.encoding} 解码失败，尝试下一个编码`)
-        }
-      }
-    }
-    Logger.error(`文件 ${filePath} 所有可能的编码均解码失败，尝试使用 UTF-8 解码`)
-    return iconv.decode(buffer, 'UTF-8')
+  // Keep at most two distinct candidates that iconv-lite can decode; ascii is decoded as UTF-8
+  const encodings = jschardet
+    .detectAll(sample)
+    .map((item) => ({ ...item, encoding: item.encoding === 'ascii' ? 'UTF-8' : item.encoding }))
+    .filter((item) => iconv.encodingExists(item.encoding))
+    .filter((item, index, array) => array.findIndex((prevItem) => prevItem.encoding === item.encoding) === index)
+    .slice(0, 2)
+
+  const data = await readFile(filePath)
+  if (encodings.length === 0) {
+    Logger.error('Failed to detect encoding. Use utf-8 to decode.')
+    return iconv.decode(data, 'UTF-8')
   }
 
-  return content
+  for (const { encoding } of encodings) {
+    const content = iconv.decode(data, encoding)
+    // U+FFFD marks bytes the candidate encoding could not decode
+    if (!content.includes('\uFFFD')) {
+      return content
+    }
+    Logger.error(
+      `File ${filePath} was auto-detected as ${encoding} encoding, but contains invalid characters. Trying other encodings`
+    )
+  }
+
+  Logger.error(`File ${filePath} failed to decode with all possible encodings, trying UTF-8 encoding`)
+  return iconv.decode(data, 'UTF-8')
 }
diff --git a/src/preload/index.ts b/src/preload/index.ts
index 8fc104b68e..ca88949e70 100644
--- a/src/preload/index.ts
+++ b/src/preload/index.ts
@@ -115,7 +115,8 @@ const api = {
     upload: (file: FileMetadata) => ipcRenderer.invoke(IpcChannel.File_Upload, file),
     delete: (fileId: string) => ipcRenderer.invoke(IpcChannel.File_Delete, fileId),
     deleteDir: (dirPath: string) => ipcRenderer.invoke(IpcChannel.File_DeleteDir, dirPath),
-    read: (fileId: string) => ipcRenderer.invoke(IpcChannel.File_Read, fileId),
+    read: (fileId: string, detectEncoding?: boolean) =>
+      ipcRenderer.invoke(IpcChannel.File_Read, fileId, detectEncoding),
     clear: () => ipcRenderer.invoke(IpcChannel.File_Clear),
     get: (filePath: string) => ipcRenderer.invoke(IpcChannel.File_Get, filePath),
     /**
diff --git a/src/renderer/src/aiCore/clients/BaseApiClient.ts b/src/renderer/src/aiCore/clients/BaseApiClient.ts
index d311ce2d6a..876c4e605f 100644
--- a/src/renderer/src/aiCore/clients/BaseApiClient.ts
+++ b/src/renderer/src/aiCore/clients/BaseApiClient.ts
@@ -254,7 +254,7 @@ export abstract class BaseApiClient<
 
         for (const fileBlock of textFileBlocks) {
           const file = fileBlock.file
-          const fileContent = (await window.api.file.read(file.id + file.ext)).trim()
+          const fileContent = (await window.api.file.read(file.id + file.ext, true)).trim()
           const fileNameRow = 'file: ' + file.origin_name + '\n\n'
           text = text + fileNameRow + fileContent + divider
         }
diff --git a/src/renderer/src/aiCore/clients/anthropic/AnthropicAPIClient.ts b/src/renderer/src/aiCore/clients/anthropic/AnthropicAPIClient.ts
index 93176a9566..73a5bb61c1 100644
--- a/src/renderer/src/aiCore/clients/anthropic/AnthropicAPIClient.ts
+++ b/src/renderer/src/aiCore/clients/anthropic/AnthropicAPIClient.ts
@@ -231,7 +231,7 @@ export class AnthropicAPIClient extends BaseApiClient<
             }
           })
         } else {
-          const fileContent = await (await window.api.file.read(file.id + file.ext)).trim()
+          const fileContent = (await window.api.file.read(file.id + file.ext, true)).trim()
           parts.push({
             type: 'text',
             text: file.origin_name + '\n' + fileContent
diff --git a/src/renderer/src/aiCore/clients/gemini/GeminiAPIClient.ts b/src/renderer/src/aiCore/clients/gemini/GeminiAPIClient.ts
index bcf7c0d592..bd87ccc821 100644
--- a/src/renderer/src/aiCore/clients/gemini/GeminiAPIClient.ts
+++ b/src/renderer/src/aiCore/clients/gemini/GeminiAPIClient.ts
@@ -288,7 +288,7 @@ export class GeminiAPIClient extends BaseApiClient<
         continue
       }
       if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) {
-        const fileContent = await (await window.api.file.read(file.id + file.ext)).trim()
+        const fileContent = (await window.api.file.read(file.id + file.ext, true)).trim()
         parts.push({
           text: file.origin_name + '\n' + fileContent
         })
diff --git a/src/renderer/src/aiCore/clients/openai/OpenAIApiClient.ts b/src/renderer/src/aiCore/clients/openai/OpenAIApiClient.ts
index e3ccc8edd0..b08f179fbc 100644
--- a/src/renderer/src/aiCore/clients/openai/OpenAIApiClient.ts
+++ b/src/renderer/src/aiCore/clients/openai/OpenAIApiClient.ts
@@ -307,7 +307,7 @@ export class OpenAIAPIClient extends OpenAIBaseClient<
       }
 
       if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) {
-        const fileContent = await (await window.api.file.read(file.id + file.ext)).trim()
+        const fileContent = (await window.api.file.read(file.id + file.ext, true)).trim()
         parts.push({
           type: 'text',
           text: file.origin_name + '\n' + fileContent
diff --git a/src/renderer/src/aiCore/clients/openai/OpenAIResponseAPIClient.ts b/src/renderer/src/aiCore/clients/openai/OpenAIResponseAPIClient.ts
index 898e7eec44..6de5f2f876 100644
--- a/src/renderer/src/aiCore/clients/openai/OpenAIResponseAPIClient.ts
+++ b/src/renderer/src/aiCore/clients/openai/OpenAIResponseAPIClient.ts
@@ -173,7 +173,7 @@ export class OpenAIResponseAPIClient extends OpenAIBaseClient<
       }
 
       if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) {
-        const fileContent = (await window.api.file.read(file.id + file.ext)).trim()
+        const fileContent = (await window.api.file.read(file.id + file.ext, true)).trim()
         parts.push({
           type: 'input_text',
           text: file.origin_name + '\n' + fileContent
diff --git a/src/renderer/src/services/TokenService.ts b/src/renderer/src/services/TokenService.ts
index ebe4292f1d..e1b6d48b1d 100644
--- a/src/renderer/src/services/TokenService.ts
+++ b/src/renderer/src/services/TokenService.ts
@@ -19,7 +19,7 @@ async function getFileContent(file: FileMetadata) {
   }
 
   if (file.type === FileTypes.TEXT) {
-    return await window.api.file.read(file.id + file.ext)
+    return await window.api.file.read(file.id + file.ext, true)
   }
 
   return ''