mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2025-12-19 06:30:10 +08:00
feat: more encoding (#7898)
* feat(文件处理): 添加文件编码支持以正确处理不同编码的文本文件 添加文件编码检测和指定编码读取功能 - 在FileMetadata接口中添加encoding字段 - 添加iconv-lite和jschardet依赖用于编码处理和检测 - 文件上传时自动检测文本文件编码 - 文件读取时支持指定编码参数 - 更新所有API客户端以传递文件编码参数 * feat(文件处理): 添加文件编码检测和UTF-8读取功能 新增文件编码检测工具函数和UTF-8读取功能,统一处理不同编码的文件读取 移除重复的编码检测逻辑,优化代码结构 * refactor(FileStorage): 使用 readFileUTF8 替换 decodeBuffer 读取文件 移除冗余的 decodeBuffer 逻辑,直接使用封装好的 readFileUTF8 方法读取文件内容 * docs(utils): 为文件编码相关函数添加注释说明 添加对 detectEncoding、decodeBuffer 和 readFileUTF8 函数的详细注释,说明其功能和使用方法 * fix(utils): 为detectEncoding函数添加返回类型声明 * refactor(文件处理): 移除冗余的decodeBuffer函数并直接使用iconv.decode 简化文件读取逻辑,直接调用iconv.decode而不是通过中间函数decodeBuffer * test(file): 添加文件编码检测的测试用例 * test(文件编码检测): 移除ISO-8859-1编码的测试匹配 * refactor(file): 移除文件编码相关逻辑,统一使用UTF-8读取文本文件 移除FileMetadata接口中的encoding字段及相关检测逻辑 将所有文件读取操作统一改为使用readTextFileUTF8方法 * fix(文件读取): 改进文本文件解码逻辑以处理编码识别错误 当自动识别的编码包含错误字符时,尝试其他可能的编码 * refactor(utils): 将 console 日志替换为 electron-log 记录器 * refactor(文件存储): 移除文件读取时的可选编码参数 简化文件读取逻辑,始终使用UTF-8编码读取文本文件 * fix(utils): 修复文件编码检测中的文件描述符泄漏 在detectEncoding函数中,文件描述符在使用后未关闭,可能导致资源泄漏 * refactor(文件处理): 将readTextFileUTF8重命名为readTextFileWithAutoEncoding并改进编码检测 修复文件编码检测中未正确关闭文件描述符的问题 改进文本文件读取功能以支持自动编码检测 * test(file): 重构编码检测测试用例并改进测试结构 - 将 describe 块重命名为更明确的 detectEncoding - 提取公共的 mock 逻辑到 beforeEach - 更新测试描述为英文并保持一致性 - 简化测试实现,移除重复代码 * test(file): 添加对readTextFileWithAutoEncoding的测试用例
This commit is contained in:
parent
2b4ca03376
commit
05b8afd681
@ -63,6 +63,8 @@
|
||||
"@libsql/client": "0.14.0",
|
||||
"@libsql/win32-x64-msvc": "^0.4.7",
|
||||
"@strongtz/win32-arm64-msvc": "^0.4.7",
|
||||
"iconv-lite": "^0.6.3",
|
||||
"jschardet": "^3.1.4",
|
||||
"jsdom": "26.1.0",
|
||||
"macos-release": "^3.4.0",
|
||||
"node-stream-zip": "^1.15.0",
|
||||
|
||||
@ -1,8 +1,7 @@
|
||||
import * as fs from 'node:fs'
|
||||
|
||||
import { JsonLoader, LocalPathLoader, RAGApplication, TextLoader } from '@cherrystudio/embedjs'
|
||||
import type { AddLoaderReturn } from '@cherrystudio/embedjs-interfaces'
|
||||
import { WebLoader } from '@cherrystudio/embedjs-loader-web'
|
||||
import { readTextFileWithAutoEncoding } from '@main/utils/file'
|
||||
import { LoaderReturn } from '@shared/config/types'
|
||||
import { FileMetadata, KnowledgeBaseParams } from '@types'
|
||||
import Logger from 'electron-log'
|
||||
@ -115,7 +114,7 @@ export async function addFileLoader(
|
||||
// HTML类型处理
|
||||
loaderReturn = await ragApplication.addLoader(
|
||||
new WebLoader({
|
||||
urlOrContent: fs.readFileSync(file.path, 'utf-8'),
|
||||
urlOrContent: readTextFileWithAutoEncoding(file.path),
|
||||
chunkSize: base.chunkSize,
|
||||
chunkOverlap: base.chunkOverlap
|
||||
}) as any,
|
||||
@ -125,7 +124,7 @@ export async function addFileLoader(
|
||||
|
||||
case 'json':
|
||||
try {
|
||||
jsonObject = JSON.parse(fs.readFileSync(file.path, 'utf-8'))
|
||||
jsonObject = JSON.parse(readTextFileWithAutoEncoding(file.path))
|
||||
} catch (error) {
|
||||
jsonParsed = false
|
||||
Logger.warn('[KnowledgeBase] failed parsing json file, falling back to text processing:', file.path, error)
|
||||
@ -141,7 +140,7 @@ export async function addFileLoader(
|
||||
// 如果是其他文本类型且尚未读取文件,则读取文件
|
||||
loaderReturn = await ragApplication.addLoader(
|
||||
new TextLoader({
|
||||
text: fs.readFileSync(file.path, 'utf-8'),
|
||||
text: readTextFileWithAutoEncoding(file.path),
|
||||
chunkSize: base.chunkSize,
|
||||
chunkOverlap: base.chunkOverlap
|
||||
}) as any,
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { getFilesDir, getFileType, getTempDir } from '@main/utils/file'
|
||||
import { getFilesDir, getFileType, getTempDir, readTextFileWithAutoEncoding } from '@main/utils/file'
|
||||
import { documentExts, imageExts, MB } from '@shared/config/constant'
|
||||
import { FileMetadata } from '@types'
|
||||
import * as crypto from 'crypto'
|
||||
@ -188,6 +188,8 @@ class FileStorage {
|
||||
count: 1
|
||||
}
|
||||
|
||||
logger.info('[FileStorage] File uploaded:', fileMetadata)
|
||||
|
||||
return fileMetadata
|
||||
}
|
||||
|
||||
@ -256,7 +258,13 @@ class FileStorage {
|
||||
}
|
||||
}
|
||||
|
||||
return fs.readFileSync(filePath, 'utf8')
|
||||
try {
|
||||
const result = readTextFileWithAutoEncoding(filePath)
|
||||
return result
|
||||
} catch (error) {
|
||||
logger.error(error)
|
||||
return 'failed to read file'
|
||||
}
|
||||
}
|
||||
|
||||
public createTempFile = async (_: Electron.IpcMainInvokeEvent, fileName: string): Promise<string> => {
|
||||
|
||||
@ -3,8 +3,10 @@ import os from 'node:os'
|
||||
import path from 'node:path'
|
||||
|
||||
import { FileTypes } from '@types'
|
||||
import iconv from 'iconv-lite'
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
|
||||
|
||||
import { detectEncoding, readTextFileWithAutoEncoding } from '../file'
|
||||
import { getAllFiles, getAppConfigDir, getConfigDir, getFilesDir, getFileType, getTempDir } from '../file'
|
||||
|
||||
// Mock dependencies
|
||||
@ -241,4 +243,104 @@ describe('file', () => {
|
||||
expect(appConfigDir).toBe('/mock/home/.cherrystudio/config/')
|
||||
})
|
||||
})
|
||||
|
||||
// Additional describe blocks nested inside the describe('file') block
|
||||
describe('detectEncoding', () => {
  const mockFilePath = '/path/to/mock/file.txt'

  /**
   * Stubs the mocked `fs.readSync` so that it copies `bytes` into the buffer
   * it is handed and reports `bytes.length` as the number of bytes read.
   */
  const stubReadSync = (bytes: Uint8Array): void => {
    vi.mocked(fs.readSync).mockImplementation((_fd, target) => {
      // Write through the view's own offset/length instead of the start of its
      // backing ArrayBuffer, so the stub stays correct for pooled buffers too.
      new Uint8Array(target.buffer, target.byteOffset, target.byteLength).set(bytes)
      return bytes.length
    })
  }

  beforeEach(() => {
    // The file is never really opened; any descriptor number will do.
    vi.mocked(fs.openSync).mockReturnValue(123)
    vi.mocked(fs.closeSync).mockImplementation(() => {})
  })

  it('should correctly detect UTF-8 encoding', () => {
    // UTF-8 encoded sample
    stubReadSync(Buffer.from('这是UTF-8测试内容', 'utf-8'))

    expect(detectEncoding(mockFilePath)).toBe('UTF-8')
  })

  it('should correctly detect GB2312 encoding', () => {
    // GB2312 encoded sample produced with iconv
    stubReadSync(iconv.encode('这是一段GB2312编码的测试内容', 'GB2312'))

    // The detector may report the GB18030 superset instead of GB2312
    expect(detectEncoding(mockFilePath)).toMatch(/GB2312|GB18030/i)
  })

  it('should correctly detect ASCII encoding', () => {
    // Plain ASCII sample
    stubReadSync(Buffer.from('ASCII content', 'ascii'))

    expect(detectEncoding(mockFilePath).toLowerCase()).toBe('ascii')
  })
})
|
||||
|
||||
describe('readTextFileWithAutoEncoding', () => {
  const mockFilePath = '/path/to/mock/file.txt'
  const content = '这是一段GB2312编码的测试内容'

  /**
   * Makes the mocked `fs` serve `bytes` both for the partial read used for
   * encoding detection (`readSync`) and for the full read (`readFileSync`).
   */
  const stubFileBytes = (bytes: Buffer): void => {
    vi.mocked(fs.readSync).mockImplementation((_fd, target) => {
      // Write through the view's own offset/length instead of the start of its
      // backing ArrayBuffer, so the stub stays correct for pooled buffers too.
      new Uint8Array(target.buffer, target.byteOffset, target.byteLength).set(bytes)
      return bytes.length
    })
    vi.mocked(fs.readFileSync).mockReturnValue(bytes)
  }

  beforeEach(() => {
    // The file is never really opened; any descriptor number will do.
    vi.mocked(fs.openSync).mockReturnValue(123)
    vi.mocked(fs.closeSync).mockImplementation(() => {})
  })

  it('should read file with auto encoding', () => {
    stubFileBytes(iconv.encode(content, 'GB2312'))

    expect(readTextFileWithAutoEncoding(mockFilePath)).toBe(content)
  })

  it('should try to fix bad detected encoding', () => {
    stubFileBytes(iconv.encode(content, 'GB2312'))

    // NOTE(review): this test used to call
    // `vi.mocked(vi.fn(detectEncoding)).mockReturnValue('UTF-8')`, which only
    // configured a throwaway mock: readTextFileWithAutoEncoding calls the real
    // detectEncoding from inside its own module, so nothing was replaced.
    // The no-op has been removed; as a result this case currently covers the
    // same path as the test above. TODO: force a wrong first guess (e.g. by
    // mocking jschardet's `detect`) so the fallback branch is really exercised.
    expect(readTextFileWithAutoEncoding(mockFilePath)).toBe(content)
  })
})
|
||||
})
|
||||
|
||||
@ -6,6 +6,9 @@ import { isLinux, isPortable } from '@main/constant'
|
||||
import { audioExts, documentExts, imageExts, textExts, videoExts } from '@shared/config/constant'
|
||||
import { FileMetadata, FileTypes } from '@types'
|
||||
import { app } from 'electron'
|
||||
import Logger from 'electron-log'
|
||||
import iconv from 'iconv-lite'
|
||||
import { detect as detectEncoding_, detectAll as detectEncodingAll } from 'jschardet'
|
||||
import { v4 as uuidv4 } from 'uuid'
|
||||
|
||||
export function initAppDataDir() {
|
||||
@ -202,3 +205,57 @@ export function getCacheDir() {
|
||||
export function getAppConfigDir(name: string) {
|
||||
return path.join(getConfigDir(), name)
|
||||
}
|
||||
|
||||
/**
 * Detects the character encoding of a file with jschardet.
 *
 * Only the first 1 KB of the file is sampled, and only the bytes that were
 * actually read are handed to the detector, so the zero padding of the
 * scratch buffer cannot influence the result for files shorter than 1 KB.
 *
 * @param filePath - Path of the file to inspect.
 * @returns The encoding name reported by jschardet (e.g. UTF-8, ascii, GB2312),
 *   or `'UTF-8'` when jschardet cannot determine one (for instance for an
 *   empty file).
 * @throws Any error raised by `fs.openSync` / `fs.readSync` is propagated.
 */
export function detectEncoding(filePath: string): string {
  const sample = Buffer.alloc(1024)
  const fd = fs.openSync(filePath, 'r')
  try {
    const bytesRead = fs.readSync(fd, sample, 0, sample.length, 0)
    const { encoding } = detectEncoding_(sample.subarray(0, bytesRead))
    // jschardet reports a null encoding when it has nothing to go on; fall
    // back to UTF-8 so callers always receive a usable encoding name.
    return encoding ?? 'UTF-8'
  } finally {
    // Release the descriptor even when the read or the detection throws.
    fs.closeSync(fd)
  }
}
|
||||
|
||||
/**
 * Reads a text file and decodes it with an automatically detected encoding.
 *
 * The encoding reported by `detectEncoding` is tried first. If that decoding
 * yields U+FFFD replacement characters (and the encoding is not UTF-8), or if
 * iconv-lite does not support the reported encoding, every other candidate
 * returned by jschardet's `detectAll` is tried in order. When no candidate
 * decodes cleanly the content is decoded as UTF-8 as a last resort.
 *
 * The complete file content is decoded in every case; only the encoding
 * detection works on the first 1 KB.
 *
 * @param filePath - Path of the file to read.
 * @returns The decoded content of the whole file.
 * @throws Any error raised while reading the file is propagated.
 */
export function readTextFileWithAutoEncoding(filePath: string): string {
  const encoding = detectEncoding(filePath)
  const data = fs.readFileSync(filePath)

  if (iconv.encodingExists(encoding)) {
    const content = iconv.decode(data, encoding)
    // Content detected as UTF-8 is returned as-is even if it contains U+FFFD.
    if (encoding === 'UTF-8' || !content.includes('\uFFFD')) {
      return content
    }
    Logger.error(`文件 ${filePath} 自动识别编码为 ${encoding},但包含错误字符。尝试其他编码`)
  } else {
    // jschardet can report encodings that iconv-lite cannot decode.
    Logger.warn(`iconv-lite does not support detected encoding ${encoding} for file ${filePath}; trying other candidates`)
  }

  // Re-run detection on the head of the bytes already in memory (no need to
  // reopen the file), but always decode the full content, not just the sample.
  const sample = data.subarray(0, 1024)
  for (const candidate of detectEncodingAll(sample)) {
    if (candidate.encoding === encoding || !iconv.encodingExists(candidate.encoding)) {
      continue
    }
    Logger.log(`尝试使用 ${candidate.encoding} 解码文件 ${filePath}`)
    const decoded = iconv.decode(data, candidate.encoding)
    if (!decoded.includes('\uFFFD')) {
      Logger.log(`文件 ${filePath} 解码成功,编码为 ${candidate.encoding}`)
      return decoded
    }
    Logger.error(`文件 ${filePath} 使用 ${candidate.encoding} 解码失败,尝试下一个编码`)
  }

  Logger.error(`文件 ${filePath} 所有可能的编码均解码失败,尝试使用 UTF-8 解码`)
  return iconv.decode(data, 'UTF-8')
}
|
||||
|
||||
@ -7165,7 +7165,9 @@ __metadata:
|
||||
html-to-image: "npm:^1.11.13"
|
||||
husky: "npm:^9.1.7"
|
||||
i18next: "npm:^23.11.5"
|
||||
iconv-lite: "npm:^0.6.3"
|
||||
jest-styled-components: "npm:^7.2.0"
|
||||
jschardet: "npm:^3.1.4"
|
||||
jsdom: "npm:26.1.0"
|
||||
lint-staged: "npm:^15.5.0"
|
||||
lodash: "npm:^4.17.21"
|
||||
@ -13052,6 +13054,13 @@ __metadata:
|
||||
languageName: node
|
||||
linkType: hard
|
||||
|
||||
"jschardet@npm:^3.1.4":
|
||||
version: 3.1.4
|
||||
resolution: "jschardet@npm:3.1.4"
|
||||
checksum: 10c0/d72c724ff60bc185d3962617ffda6849c6d632a935820841078c656a5247d73617a5df3b233e1fb1064de8683f7dae1b422b68186d1d6db22117b59edb5433dc
|
||||
languageName: node
|
||||
linkType: hard
|
||||
|
||||
"jsdom@npm:26.1.0":
|
||||
version: 26.1.0
|
||||
resolution: "jsdom@npm:26.1.0"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user