fix: change jschardet to chardet (#8577)

* fix: change jschardet to chardet

* Update file.test.ts

* fix: error

* fix: test fail

* fix: test error

* Update file.test.ts

* fix: optimize details

* Update file.test.ts

* Update file.ts

* Update file.ts

* Update file.test.ts
This commit is contained in:
自由的世界人 2025-07-29 17:27:36 +08:00 committed by GitHub
parent 7098489f15
commit 27af64f2bd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 44 additions and 80 deletions

View File

@ -168,6 +168,7 @@
"async-mutex": "^0.5.0",
"axios": "^1.7.3",
"browser-image-compression": "^2.0.2",
"chardet": "^2.1.0",
"cli-progress": "^3.12.0",
"code-inspector-plugin": "^0.20.14",
"color": "^5.0.0",
@ -204,7 +205,6 @@
"iconv-lite": "^0.6.3",
"jaison": "^2.0.2",
"jest-styled-components": "^7.2.0",
"jschardet": "^3.1.4",
"linguist-languages": "^8.0.0",
"lint-staged": "^15.5.0",
"lodash": "^4.17.21",

View File

@ -38,7 +38,7 @@ import { IpcChannel } from '@shared/IpcChannel'
import { FileMetadata, KnowledgeBaseParams, KnowledgeItem } from '@types'
import { v4 as uuidv4 } from 'uuid'
const logger = loggerService.withContext('KnowledgeService')
const logger = loggerService.withContext('MainKnowledgeService')
export interface KnowledgeBaseAddItemOptions {
base: KnowledgeBaseParams

View File

@ -1,3 +1,4 @@
import { loggerService } from '@logger'
import { isDev } from '@main/constant'
import { CacheBatchSpanProcessor, FunctionSpanExporter } from '@mcp-trace/trace-core'
import { NodeTracer as MCPNodeTracer } from '@mcp-trace/trace-node/nodeTracer'
@ -6,7 +7,6 @@ import { BrowserWindow, ipcMain } from 'electron'
import * as path from 'path'
import { ConfigKeys, configManager } from './ConfigManager'
import { loggerService } from './LoggerService'
import { spanCacheService } from './SpanCacheService'
export const TRACER_NAME = 'CherryStudio'

View File

@ -4,8 +4,8 @@ import os from 'node:os'
import path from 'node:path'
import { FileTypes } from '@types'
import chardet from 'chardet'
import iconv from 'iconv-lite'
import { detectAll as detectEncodingAll } from 'jschardet'
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
import { readTextFileWithAutoEncoding } from '../file'
@ -260,46 +260,24 @@ describe('file', () => {
const mockFilePath = '/path/to/mock/file.txt'
it('should read file with auto encoding', async () => {
const content = '这是一段GB2312编码的测试内容'
const buffer = iconv.encode(content, 'GB2312')
const content = '这是一段GB18030编码的测试内容'
const buffer = iconv.encode(content, 'GB18030')
// 创建模拟的 FileHandle 对象
const mockFileHandle = {
read: vi.fn().mockResolvedValue({
bytesRead: buffer.byteLength,
buffer: buffer
}),
close: vi.fn().mockResolvedValue(undefined)
}
// 模拟 open 方法
vi.spyOn(fsPromises, 'open').mockResolvedValue(mockFileHandle as any)
// 模拟文件读取和编码检测
vi.spyOn(fsPromises, 'readFile').mockResolvedValue(buffer)
vi.spyOn(chardet, 'detectFile').mockResolvedValue('GB18030')
const result = await readTextFileWithAutoEncoding(mockFilePath)
expect(result).toBe(content)
})
it('should try to fix bad detected encoding', async () => {
const content = '这是一段GB2312编码的测试内容'
const buffer = iconv.encode(content, 'GB2312')
const content = '这是一段UTF-8编码的测试内容'
const buffer = iconv.encode(content, 'UTF-8')
// 创建模拟的 FileHandle 对象
const mockFileHandle = {
read: vi.fn().mockResolvedValue({
bytesRead: buffer.byteLength,
buffer: buffer
}),
close: vi.fn().mockResolvedValue(undefined)
}
// 模拟 fs.open 方法
vi.spyOn(fsPromises, 'open').mockResolvedValue(mockFileHandle as any)
// 模拟文件读取
vi.spyOn(fsPromises, 'readFile').mockResolvedValue(buffer)
vi.mocked(vi.fn(detectEncodingAll)).mockReturnValue([
{ encoding: 'UTF-8', confidence: 0.9 },
{ encoding: 'GB2312', confidence: 0.8 }
])
vi.spyOn(chardet, 'detectFile').mockResolvedValue('GB18030')
const result = await readTextFileWithAutoEncoding(mockFilePath)
expect(result).toBe(content)

View File

@ -1,14 +1,14 @@
import * as fs from 'node:fs'
import { open, readFile } from 'node:fs/promises'
import { readFile } from 'node:fs/promises'
import os from 'node:os'
import path from 'node:path'
import { loggerService } from '@logger'
import { audioExts, documentExts, imageExts, MB, textExts, videoExts } from '@shared/config/constant'
import { FileMetadata, FileTypes } from '@types'
import chardet from 'chardet'
import { app } from 'electron'
import iconv from 'iconv-lite'
import * as jschardet from 'jschardet'
import { v4 as uuidv4 } from 'uuid'
const logger = loggerService.withContext('Utils:File')
@ -170,39 +170,24 @@ export function getMcpDir() {
* @returns
*/
export async function readTextFileWithAutoEncoding(filePath: string): Promise<string> {
// 读取前1MB以检测编码
const buffer = Buffer.alloc(1 * MB)
const fh = await open(filePath, 'r')
const { buffer: bufferRead } = await fh.read(buffer, 0, 1 * MB, 0)
await fh.close()
// 获取文件编码格式,最多取前两个可能的编码
const encodings = jschardet
.detectAll(bufferRead)
.map((item) => ({
...item,
encoding: item.encoding === 'ascii' ? 'UTF-8' : item.encoding
}))
.filter((item, index, array) => array.findIndex((prevItem) => prevItem.encoding === item.encoding) === index)
.slice(0, 2)
if (encodings.length === 0) {
logger.error('Failed to detect encoding. Use utf-8 to decode.')
const data = await readFile(filePath)
return iconv.decode(data, 'UTF-8')
}
const encoding = (await chardet.detectFile(filePath, { sampleSize: MB })) || 'UTF-8'
logger.debug(`File ${filePath} detected encoding: ${encoding}`)
const encodings = [encoding, 'UTF-8']
const data = await readFile(filePath)
for (const item of encodings) {
const encoding = item.encoding
const content = iconv.decode(data, encoding)
if (content.includes('\uFFFD')) {
logger.error(
`File ${filePath} was auto-detected as ${encoding} encoding, but contains invalid characters. Trying other encodings`
)
} else {
return content
for (const encoding of encodings) {
try {
const content = iconv.decode(data, encoding)
if (!content.includes('\uFFFD')) {
return content
} else {
logger.warn(
`File ${filePath} was auto-detected as ${encoding} encoding, but contains invalid characters. Trying other encodings`
)
}
} catch (error) {
logger.error(`Failed to decode file ${filePath} with encoding ${encoding}: ${error}`)
}
}

View File

@ -1,13 +1,14 @@
import Anthropic from '@anthropic-ai/sdk'
import AnthropicVertex from '@anthropic-ai/vertex-sdk'
import { loggerService } from '@logger'
import { getVertexAILocation, getVertexAIProjectId, getVertexAIServiceAccount } from '@renderer/hooks/useVertexAI'
import { loggerService } from '@renderer/services/LoggerService'
import { Provider } from '@renderer/types'
import { isEmpty } from 'lodash'
const logger = loggerService.withContext('AnthropicVertexClient')
import { AnthropicAPIClient } from './AnthropicAPIClient'
const logger = loggerService.withContext('AnthropicVertexClient')
export class AnthropicVertexClient extends AnthropicAPIClient {
sdkInstance: AnthropicVertex | undefined = undefined
private authHeaders?: Record<string, string>

View File

@ -1,6 +1,6 @@
import { loggerService } from '@logger'
import { db } from '@renderer/databases'
import { getDefaultTopic } from '@renderer/services/AssistantService'
import { loggerService } from '@renderer/services/LoggerService'
import { useAppDispatch, useAppSelector } from '@renderer/store'
import {
addAssistant,

View File

@ -1,4 +1,4 @@
import { loggerService } from '@renderer/services/LoggerService'
import { loggerService } from '@logger'
import { defaultLanguage } from '@shared/config/constant'
import i18n from 'i18next'
import { initReactI18next } from 'react-i18next'

View File

@ -195,7 +195,7 @@ class KnowledgeQueue {
updateBaseItemIsPreprocessed({
baseId,
itemId: item.id,
isPreprocessed: base.preprocessOrOcrProvider ? true : false
isPreprocessed: !!base.preprocessOrOcrProvider
})
)
}

View File

@ -13,7 +13,7 @@ import { isEmpty } from 'lodash'
import { getProviderByModel } from './AssistantService'
import FileManager from './FileManager'
const logger = loggerService.withContext('KnowledgeService')
const logger = loggerService.withContext('RendererKnowledgeService')
export const getKnowledgeBaseParams = (base: KnowledgeBase): KnowledgeBaseParams => {
const provider = getProviderByModel(base.model)

View File

@ -7293,6 +7293,7 @@ __metadata:
async-mutex: "npm:^0.5.0"
axios: "npm:^1.7.3"
browser-image-compression: "npm:^2.0.2"
chardet: "npm:^2.1.0"
cli-progress: "npm:^3.12.0"
code-inspector-plugin: "npm:^0.20.14"
color: "npm:^5.0.0"
@ -7330,7 +7331,6 @@ __metadata:
iconv-lite: "npm:^0.6.3"
jaison: "npm:^2.0.2"
jest-styled-components: "npm:^7.2.0"
jschardet: "npm:^3.1.4"
jsdom: "npm:26.1.0"
linguist-languages: "npm:^8.0.0"
lint-staged: "npm:^15.5.0"
@ -8593,6 +8593,13 @@ __metadata:
languageName: node
linkType: hard
"chardet@npm:^2.1.0":
version: 2.1.0
resolution: "chardet@npm:2.1.0"
checksum: 10c0/d1b03e47371851ed72741a898281d58f8a9b577aeea6fdfa75a86832898b36c550b3ad057e66d50d774a9cebd9f56c66b6880e4fe75e387794538ba7565b0b6f
languageName: node
linkType: hard
"charenc@npm:0.0.2":
version: 0.0.2
resolution: "charenc@npm:0.0.2"
@ -13350,13 +13357,6 @@ __metadata:
languageName: node
linkType: hard
"jschardet@npm:^3.1.4":
version: 3.1.4
resolution: "jschardet@npm:3.1.4"
checksum: 10c0/d72c724ff60bc185d3962617ffda6849c6d632a935820841078c656a5247d73617a5df3b233e1fb1064de8683f7dae1b422b68186d1d6db22117b59edb5433dc
languageName: node
linkType: hard
"jsdom@npm:26.1.0":
version: 26.1.0
resolution: "jsdom@npm:26.1.0"