From bfb64522cdfef955e768e9d4aeff1f44e1fa05da Mon Sep 17 00:00:00 2001
From: icarus
Date: Sat, 23 Aug 2025 13:26:09 +0800
Subject: [PATCH] =?UTF-8?q?refactor(ocr):=20=E5=B0=86Tesseract=E7=9B=B8?=
 =?UTF-8?q?=E5=85=B3=E9=85=8D=E7=BD=AE=E7=A7=BB=E8=87=B3=E6=9C=8D=E5=8A=A1?=
 =?UTF-8?q?=E5=86=85=E9=83=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

将语言列表和下载URL常量从共享配置移至Tesseract服务内部
使用常量定义图片大小阈值以提高可读性
---
 packages/shared/config/constant.ts               |  6 ------
 .../services/ocr/tesseract/TesseractService.ts   | 16 ++++++++++++----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/packages/shared/config/constant.ts b/packages/shared/config/constant.ts
index 9240bb73f5..82b78459a5 100644
--- a/packages/shared/config/constant.ts
+++ b/packages/shared/config/constant.ts
@@ -197,12 +197,6 @@ export enum FeedUrl {
   GITHUB_LATEST = 'https://github.com/CherryHQ/cherry-studio/releases/latest/download'
 }
 
-export const tesseractLangs = ['chi_sim', 'chi_tra', 'eng']
-export enum TesseractLangsDownloadUrl {
-  CN = 'https://gitcode.com/beyondkmp/tessdata/releases/download/4.1.0/',
-  GLOBAL = 'https://github.com/tesseract-ocr/tessdata/raw/main/'
-}
-
 export enum UpgradeChannel {
   LATEST = 'latest', // 最新稳定版本
   RC = 'rc', // 公测版本
diff --git a/src/main/services/ocr/tesseract/TesseractService.ts b/src/main/services/ocr/tesseract/TesseractService.ts
index 2472f02fba..06f460dfea 100644
--- a/src/main/services/ocr/tesseract/TesseractService.ts
+++ b/src/main/services/ocr/tesseract/TesseractService.ts
@@ -1,6 +1,6 @@
 import { loggerService } from '@logger'
 import { getIpCountry } from '@main/utils/ipService'
-import { MB, TesseractLangsDownloadUrl } from '@shared/config/constant'
+import { MB } from '@shared/config/constant'
 import { FileMetadata, ImageFileMetadata, isImageFile, OcrResult } from '@types'
 import { app } from 'electron'
 import fs from 'fs'
@@ -114,13 +114,21 @@ const logger = loggerService.withContext('TesseractService')
 // 'yi-us': 'yid'
 // }
 
+// config
+const MB_SIZE_THRESHOLD = 50
+const tesseractLangs = ['chi_sim', 'chi_tra', 'eng']
+enum TesseractLangsDownloadUrl {
+  CN = 'https://gitcode.com/beyondkmp/tessdata/releases/download/4.1.0/',
+  GLOBAL = 'https://github.com/tesseract-ocr/tessdata/raw/main/'
+}
+
 export class TesseractService {
   private worker: Tesseract.Worker | null = null
 
   async getWorker(): Promise {
     if (!this.worker) {
       // for now, only support limited languages
-      this.worker = await createWorker(['chi_sim', 'chi_tra', 'eng'], undefined, {
+      this.worker = await createWorker(tesseractLangs, undefined, {
         langPath: await this._getLangPath(),
         cachePath: await this._getCacheDir(),
         gzip: false,
@@ -133,8 +141,8 @@ export class TesseractService {
   async imageOcr(file: ImageFileMetadata): Promise {
     const worker = await this.getWorker()
     const stat = await fs.promises.stat(file.path)
-    if (stat.size > 50 * MB) {
-      throw new Error('This image is too large (max 50MB)')
+    if (stat.size > MB_SIZE_THRESHOLD * MB) {
+      throw new Error(`This image is too large (max ${MB_SIZE_THRESHOLD}MB)`)
     }
     const buffer = await fs.promises.readFile(file.path)
     const result = await worker.recognize(buffer)