mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2025-12-26 03:31:24 +08:00
fix(encoding): encoding detection and decoding logic (#8024)
This commit is contained in:
parent
bcc1046cdf
commit
a7b78c547a
@ -114,7 +114,7 @@ export async function addFileLoader(
|
||||
// HTML类型处理
|
||||
loaderReturn = await ragApplication.addLoader(
|
||||
new WebLoader({
|
||||
urlOrContent: readTextFileWithAutoEncoding(file.path),
|
||||
urlOrContent: await readTextFileWithAutoEncoding(file.path),
|
||||
chunkSize: base.chunkSize,
|
||||
chunkOverlap: base.chunkOverlap
|
||||
}) as any,
|
||||
@ -124,7 +124,7 @@ export async function addFileLoader(
|
||||
|
||||
case 'json':
|
||||
try {
|
||||
jsonObject = JSON.parse(readTextFileWithAutoEncoding(file.path))
|
||||
jsonObject = JSON.parse(await readTextFileWithAutoEncoding(file.path))
|
||||
} catch (error) {
|
||||
jsonParsed = false
|
||||
Logger.warn('[KnowledgeBase] failed parsing json file, falling back to text processing:', file.path, error)
|
||||
@ -140,7 +140,7 @@ export async function addFileLoader(
|
||||
// 如果是其他文本类型且尚未读取文件,则读取文件
|
||||
loaderReturn = await ragApplication.addLoader(
|
||||
new TextLoader({
|
||||
text: readTextFileWithAutoEncoding(file.path),
|
||||
text: await readTextFileWithAutoEncoding(file.path),
|
||||
chunkSize: base.chunkSize,
|
||||
chunkOverlap: base.chunkOverlap
|
||||
}) as any,
|
||||
|
||||
@ -231,7 +231,11 @@ class FileStorage {
|
||||
await fs.promises.rm(path.join(this.storageDir, id), { recursive: true })
|
||||
}
|
||||
|
||||
public readFile = async (_: Electron.IpcMainInvokeEvent, id: string): Promise<string> => {
|
||||
public readFile = async (
|
||||
_: Electron.IpcMainInvokeEvent,
|
||||
id: string,
|
||||
detectEncoding: boolean = false
|
||||
): Promise<string> => {
|
||||
const filePath = path.join(this.storageDir, id)
|
||||
|
||||
const fileExtension = path.extname(filePath)
|
||||
@ -259,8 +263,11 @@ class FileStorage {
|
||||
}
|
||||
|
||||
try {
|
||||
const result = readTextFileWithAutoEncoding(filePath)
|
||||
return result
|
||||
if (detectEncoding) {
|
||||
return readTextFileWithAutoEncoding(filePath)
|
||||
} else {
|
||||
return fs.readFileSync(filePath, 'utf-8')
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(error)
|
||||
return 'failed to read file'
|
||||
|
||||
@ -1,16 +1,19 @@
|
||||
import * as fs from 'node:fs'
|
||||
import * as fsPromises from 'node:fs/promises'
|
||||
import os from 'node:os'
|
||||
import path from 'node:path'
|
||||
|
||||
import { FileTypes } from '@types'
|
||||
import iconv from 'iconv-lite'
|
||||
import { detectAll as detectEncodingAll } from 'jschardet'
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
|
||||
|
||||
import { detectEncoding, readTextFileWithAutoEncoding } from '../file'
|
||||
import { readTextFileWithAutoEncoding } from '../file'
|
||||
import { getAllFiles, getAppConfigDir, getConfigDir, getFilesDir, getFileType, getTempDir } from '../file'
|
||||
|
||||
// Mock dependencies
|
||||
vi.mock('node:fs')
|
||||
vi.mock('node:fs/promises')
|
||||
vi.mock('node:os')
|
||||
vi.mock('node:path')
|
||||
vi.mock('uuid', () => ({
|
||||
@ -244,102 +247,52 @@ describe('file', () => {
|
||||
})
|
||||
})
|
||||
|
||||
// 在 describe('file') 块内部添加新的 describe 块
|
||||
describe('detectEncoding', () => {
|
||||
const mockFilePath = '/path/to/mock/file.txt'
|
||||
|
||||
beforeEach(() => {
|
||||
vi.mocked(fs.openSync).mockReturnValue(123)
|
||||
vi.mocked(fs.closeSync).mockImplementation(() => {})
|
||||
})
|
||||
|
||||
it('should correctly detect UTF-8 encoding', () => {
|
||||
// 准备UTF-8编码的Buffer
|
||||
const content = '这是UTF-8测试内容'
|
||||
const buffer = Buffer.from(content, 'utf-8')
|
||||
|
||||
// 模拟文件读取
|
||||
vi.mocked(fs.readSync).mockImplementation((_, buf) => {
|
||||
const targetBuffer = new Uint8Array(buf.buffer)
|
||||
const sourceBuffer = new Uint8Array(buffer)
|
||||
targetBuffer.set(sourceBuffer)
|
||||
return 1024
|
||||
})
|
||||
|
||||
const encoding = detectEncoding(mockFilePath)
|
||||
expect(encoding).toBe('UTF-8')
|
||||
})
|
||||
|
||||
it('should correctly detect GB2312 encoding', () => {
|
||||
// 使用iconv创建GB2312编码内容
|
||||
const content = '这是一段GB2312编码的测试内容'
|
||||
const gb2312Buffer = iconv.encode(content, 'GB2312')
|
||||
|
||||
// 模拟文件读取
|
||||
vi.mocked(fs.readSync).mockImplementation((_, buf) => {
|
||||
const targetBuffer = new Uint8Array(buf.buffer)
|
||||
const sourceBuffer = new Uint8Array(gb2312Buffer)
|
||||
targetBuffer.set(sourceBuffer)
|
||||
return gb2312Buffer.length
|
||||
})
|
||||
|
||||
const encoding = detectEncoding(mockFilePath)
|
||||
expect(encoding).toMatch(/GB2312|GB18030/i)
|
||||
})
|
||||
|
||||
it('should correctly detect ASCII encoding', () => {
|
||||
// 准备ASCII编码内容
|
||||
const content = 'ASCII content'
|
||||
const buffer = Buffer.from(content, 'ascii')
|
||||
|
||||
// 模拟文件读取
|
||||
vi.mocked(fs.readSync).mockImplementation((_, buf) => {
|
||||
const targetBuffer = new Uint8Array(buf.buffer)
|
||||
const sourceBuffer = new Uint8Array(buffer)
|
||||
targetBuffer.set(sourceBuffer)
|
||||
return buffer.length
|
||||
})
|
||||
|
||||
const encoding = detectEncoding(mockFilePath)
|
||||
expect(encoding.toLowerCase()).toBe('ascii')
|
||||
})
|
||||
})
|
||||
|
||||
describe('readTextFileWithAutoEncoding', () => {
|
||||
const mockFilePath = '/path/to/mock/file.txt'
|
||||
|
||||
beforeEach(() => {
|
||||
vi.mocked(fs.openSync).mockReturnValue(123)
|
||||
vi.mocked(fs.closeSync).mockImplementation(() => {})
|
||||
})
|
||||
|
||||
it('should read file with auto encoding', () => {
|
||||
it('should read file with auto encoding', async () => {
|
||||
const content = '这是一段GB2312编码的测试内容'
|
||||
const buffer = iconv.encode(content, 'GB2312')
|
||||
vi.mocked(fs.readSync).mockImplementation((_, buf) => {
|
||||
const targetBuffer = new Uint8Array(buf.buffer)
|
||||
const sourceBuffer = new Uint8Array(buffer)
|
||||
targetBuffer.set(sourceBuffer)
|
||||
return buffer.length
|
||||
})
|
||||
vi.mocked(fs.readFileSync).mockReturnValue(buffer)
|
||||
|
||||
const result = readTextFileWithAutoEncoding(mockFilePath)
|
||||
// 创建模拟的 FileHandle 对象
|
||||
const mockFileHandle = {
|
||||
read: vi.fn().mockResolvedValue({
|
||||
bytesRead: buffer.byteLength,
|
||||
buffer: buffer
|
||||
}),
|
||||
close: vi.fn().mockResolvedValue(undefined)
|
||||
}
|
||||
|
||||
// 模拟 open 方法
|
||||
vi.spyOn(fsPromises, 'open').mockResolvedValue(mockFileHandle as any)
|
||||
vi.spyOn(fsPromises, 'readFile').mockResolvedValue(buffer)
|
||||
|
||||
const result = await readTextFileWithAutoEncoding(mockFilePath)
|
||||
expect(result).toBe(content)
|
||||
})
|
||||
|
||||
it('should try to fix bad detected encoding', () => {
|
||||
it('should try to fix bad detected encoding', async () => {
|
||||
const content = '这是一段GB2312编码的测试内容'
|
||||
const buffer = iconv.encode(content, 'GB2312')
|
||||
vi.mocked(fs.readSync).mockImplementation((_, buf) => {
|
||||
const targetBuffer = new Uint8Array(buf.buffer)
|
||||
const sourceBuffer = new Uint8Array(buffer)
|
||||
targetBuffer.set(sourceBuffer)
|
||||
return buffer.length
|
||||
})
|
||||
vi.mocked(fs.readFileSync).mockReturnValue(buffer)
|
||||
vi.mocked(vi.fn(detectEncoding)).mockReturnValue('UTF-8')
|
||||
const result = readTextFileWithAutoEncoding(mockFilePath)
|
||||
|
||||
// 创建模拟的 FileHandle 对象
|
||||
const mockFileHandle = {
|
||||
read: vi.fn().mockResolvedValue({
|
||||
bytesRead: buffer.byteLength,
|
||||
buffer: buffer
|
||||
}),
|
||||
close: vi.fn().mockResolvedValue(undefined)
|
||||
}
|
||||
|
||||
// 模拟 fs.open 方法
|
||||
vi.spyOn(fsPromises, 'open').mockResolvedValue(mockFileHandle as any)
|
||||
vi.spyOn(fsPromises, 'readFile').mockResolvedValue(buffer)
|
||||
vi.mocked(vi.fn(detectEncodingAll)).mockReturnValue([
|
||||
{ encoding: 'UTF-8', confidence: 0.9 },
|
||||
{ encoding: 'GB2312', confidence: 0.8 }
|
||||
])
|
||||
|
||||
const result = await readTextFileWithAutoEncoding(mockFilePath)
|
||||
expect(result).toBe(content)
|
||||
})
|
||||
})
|
||||
|
||||
@ -1,14 +1,15 @@
|
||||
import * as fs from 'node:fs'
|
||||
import { open, readFile } from 'node:fs/promises'
|
||||
import os from 'node:os'
|
||||
import path from 'node:path'
|
||||
|
||||
import { isLinux, isPortable } from '@main/constant'
|
||||
import { audioExts, documentExts, imageExts, textExts, videoExts } from '@shared/config/constant'
|
||||
import { audioExts, documentExts, imageExts, MB, textExts, videoExts } from '@shared/config/constant'
|
||||
import { FileMetadata, FileTypes } from '@types'
|
||||
import { app } from 'electron'
|
||||
import Logger from 'electron-log'
|
||||
import iconv from 'iconv-lite'
|
||||
import { detect as detectEncoding_, detectAll as detectEncodingAll } from 'jschardet'
|
||||
import * as jschardet from 'jschardet'
|
||||
import { v4 as uuidv4 } from 'uuid'
|
||||
|
||||
export function initAppDataDir() {
|
||||
@ -206,56 +207,48 @@ export function getAppConfigDir(name: string) {
|
||||
return path.join(getConfigDir(), name)
|
||||
}
|
||||
|
||||
/**
|
||||
* 使用 jschardet 库检测文件编码格式
|
||||
* @param filePath - 文件路径
|
||||
* @returns 返回文件的编码格式,如 UTF-8, ascii, GB2312 等
|
||||
*/
|
||||
export function detectEncoding(filePath: string): string {
|
||||
// 读取文件前1KB来检测编码
|
||||
const buffer = Buffer.alloc(1024)
|
||||
const fd = fs.openSync(filePath, 'r')
|
||||
fs.readSync(fd, buffer, 0, 1024, 0)
|
||||
fs.closeSync(fd)
|
||||
const { encoding } = detectEncoding_(buffer)
|
||||
return encoding
|
||||
}
|
||||
|
||||
/**
|
||||
* 读取文件内容并自动检测编码格式进行解码
|
||||
* @param filePath - 文件路径
|
||||
* @returns 解码后的文件内容
|
||||
*/
|
||||
export function readTextFileWithAutoEncoding(filePath: string) {
|
||||
const encoding = detectEncoding(filePath)
|
||||
const data = fs.readFileSync(filePath)
|
||||
const content = iconv.decode(data, encoding)
|
||||
export async function readTextFileWithAutoEncoding(filePath: string): Promise<string> {
|
||||
// 读取前1MB以检测编码
|
||||
const buffer = Buffer.alloc(1 * MB)
|
||||
const fh = await open(filePath, 'r')
|
||||
const { buffer: bufferRead } = await fh.read(buffer, 0, 1 * MB, 0)
|
||||
await fh.close()
|
||||
|
||||
if (content.includes('\uFFFD') && encoding !== 'UTF-8') {
|
||||
Logger.error(`文件 ${filePath} 自动识别编码为 ${encoding},但包含错误字符。尝试其他编码`)
|
||||
const buffer = Buffer.alloc(1024)
|
||||
const fd = fs.openSync(filePath, 'r')
|
||||
fs.readSync(fd, buffer, 0, 1024, 0)
|
||||
fs.closeSync(fd)
|
||||
const encodings = detectEncodingAll(buffer)
|
||||
if (encodings.length > 0) {
|
||||
for (const item of encodings) {
|
||||
if (item.encoding === encoding) {
|
||||
continue
|
||||
}
|
||||
Logger.log(`尝试使用 ${item.encoding} 解码文件 ${filePath}`)
|
||||
const content = iconv.decode(buffer, item.encoding)
|
||||
if (!content.includes('\uFFFD')) {
|
||||
Logger.log(`文件 ${filePath} 解码成功,编码为 ${item.encoding}`)
|
||||
return content
|
||||
} else {
|
||||
Logger.error(`文件 ${filePath} 使用 ${item.encoding} 解码失败,尝试下一个编码`)
|
||||
}
|
||||
}
|
||||
}
|
||||
Logger.error(`文件 ${filePath} 所有可能的编码均解码失败,尝试使用 UTF-8 解码`)
|
||||
return iconv.decode(buffer, 'UTF-8')
|
||||
// 获取文件编码格式,最多取前两个可能的编码
|
||||
const encodings = jschardet
|
||||
.detectAll(bufferRead)
|
||||
.map((item) => ({
|
||||
...item,
|
||||
encoding: item.encoding === 'ascii' ? 'UTF-8' : item.encoding
|
||||
}))
|
||||
.filter((item, index, array) => array.findIndex((prevItem) => prevItem.encoding === item.encoding) === index)
|
||||
.slice(0, 2)
|
||||
|
||||
if (encodings.length === 0) {
|
||||
Logger.error('Failed to detect encoding. Use utf-8 to decode.')
|
||||
const data = await readFile(filePath)
|
||||
return iconv.decode(data, 'UTF-8')
|
||||
}
|
||||
|
||||
return content
|
||||
const data = await readFile(filePath)
|
||||
|
||||
for (const item of encodings) {
|
||||
const encoding = item.encoding
|
||||
const content = iconv.decode(data, encoding)
|
||||
if (content.includes('\uFFFD')) {
|
||||
Logger.error(
|
||||
`File ${filePath} was auto-detected as ${encoding} encoding, but contains invalid characters. Trying other encodings`
|
||||
)
|
||||
} else {
|
||||
return content
|
||||
}
|
||||
}
|
||||
|
||||
Logger.error(`File ${filePath} failed to decode with all possible encodings, trying UTF-8 encoding`)
|
||||
return iconv.decode(data, 'UTF-8')
|
||||
}
|
||||
|
||||
@ -115,7 +115,8 @@ const api = {
|
||||
upload: (file: FileMetadata) => ipcRenderer.invoke(IpcChannel.File_Upload, file),
|
||||
delete: (fileId: string) => ipcRenderer.invoke(IpcChannel.File_Delete, fileId),
|
||||
deleteDir: (dirPath: string) => ipcRenderer.invoke(IpcChannel.File_DeleteDir, dirPath),
|
||||
read: (fileId: string) => ipcRenderer.invoke(IpcChannel.File_Read, fileId),
|
||||
read: (fileId: string, detectEncoding?: boolean) =>
|
||||
ipcRenderer.invoke(IpcChannel.File_Read, fileId, detectEncoding),
|
||||
clear: () => ipcRenderer.invoke(IpcChannel.File_Clear),
|
||||
get: (filePath: string) => ipcRenderer.invoke(IpcChannel.File_Get, filePath),
|
||||
/**
|
||||
|
||||
@ -254,7 +254,7 @@ export abstract class BaseApiClient<
|
||||
|
||||
for (const fileBlock of textFileBlocks) {
|
||||
const file = fileBlock.file
|
||||
const fileContent = (await window.api.file.read(file.id + file.ext)).trim()
|
||||
const fileContent = (await window.api.file.read(file.id + file.ext, true)).trim()
|
||||
const fileNameRow = 'file: ' + file.origin_name + '\n\n'
|
||||
text = text + fileNameRow + fileContent + divider
|
||||
}
|
||||
|
||||
@ -231,7 +231,7 @@ export class AnthropicAPIClient extends BaseApiClient<
|
||||
}
|
||||
})
|
||||
} else {
|
||||
const fileContent = await (await window.api.file.read(file.id + file.ext)).trim()
|
||||
const fileContent = await (await window.api.file.read(file.id + file.ext, true)).trim()
|
||||
parts.push({
|
||||
type: 'text',
|
||||
text: file.origin_name + '\n' + fileContent
|
||||
|
||||
@ -288,7 +288,7 @@ export class GeminiAPIClient extends BaseApiClient<
|
||||
continue
|
||||
}
|
||||
if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) {
|
||||
const fileContent = await (await window.api.file.read(file.id + file.ext)).trim()
|
||||
const fileContent = await (await window.api.file.read(file.id + file.ext, true)).trim()
|
||||
parts.push({
|
||||
text: file.origin_name + '\n' + fileContent
|
||||
})
|
||||
|
||||
@ -307,7 +307,7 @@ export class OpenAIAPIClient extends OpenAIBaseClient<
|
||||
}
|
||||
|
||||
if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) {
|
||||
const fileContent = await (await window.api.file.read(file.id + file.ext)).trim()
|
||||
const fileContent = await (await window.api.file.read(file.id + file.ext, true)).trim()
|
||||
parts.push({
|
||||
type: 'text',
|
||||
text: file.origin_name + '\n' + fileContent
|
||||
|
||||
@ -173,7 +173,7 @@ export class OpenAIResponseAPIClient extends OpenAIBaseClient<
|
||||
}
|
||||
|
||||
if ([FileTypes.TEXT, FileTypes.DOCUMENT].includes(file.type)) {
|
||||
const fileContent = (await window.api.file.read(file.id + file.ext)).trim()
|
||||
const fileContent = (await window.api.file.read(file.id + file.ext, true)).trim()
|
||||
parts.push({
|
||||
type: 'input_text',
|
||||
text: file.origin_name + '\n' + fileContent
|
||||
|
||||
@ -19,7 +19,7 @@ async function getFileContent(file: FileMetadata) {
|
||||
}
|
||||
|
||||
if (file.type === FileTypes.TEXT) {
|
||||
return await window.api.file.read(file.id + file.ext)
|
||||
return await window.api.file.read(file.id + file.ext, true)
|
||||
}
|
||||
|
||||
return ''
|
||||
|
||||
Loading…
Reference in New Issue
Block a user