diff --git a/packages/shared/config/constant.ts b/packages/shared/config/constant.ts
index 82b78459a5..63c223abab 100644
--- a/packages/shared/config/constant.ts
+++ b/packages/shared/config/constant.ts
@@ -197,6 +197,14 @@ export enum FeedUrl {
   GITHUB_LATEST = 'https://github.com/CherryHQ/cherry-studio/releases/latest/download'
 }
 
+/** Language codes whose trained data the Tesseract OCR worker is created with. */
+export const tesseractLangs = ['chi_sim', 'chi_tra', 'eng']
+/** Base URLs used as the Tesseract language-data path, selected by the caller per region. */
+export enum TesseractLangsDownloadUrl {
+  CN = 'https://gitcode.com/tesseract-ocr/tessdata/raw/main/',
+  GLOBAL = 'https://github.com/tesseract-ocr/tessdata/raw/main/'
+}
+
 export enum UpgradeChannel {
   LATEST = 'latest', // 最新稳定版本
   RC = 'rc', // 公测版本
diff --git a/src/main/services/ocr/OcrService.ts b/src/main/services/ocr/OcrService.ts
index 5a52ad7833..dd6dd5de1a 100644
--- a/src/main/services/ocr/OcrService.ts
+++ b/src/main/services/ocr/OcrService.ts
@@ -13,7 +13,7 @@ import {
 import { statSync } from 'fs'
 import { readFile } from 'fs/promises'
 
-import { getTesseractWorker } from './tesseract/TesseractService'
+import { tesseractService } from './tesseract/TesseractService'
 
 const logger = loggerService.withContext('main:OcrService')
 
@@ -25,7 +25,7 @@ const logger = loggerService.withContext('main:OcrService')
  */
 const tesseractOcr = async (file: ImageFileMetadata | string): Promise => {
   try {
-    const worker = await getTesseractWorker()
+    const worker = await tesseractService.getWorker()
     let ret: Tesseract.RecognizeResult
     if (typeof file === 'string') {
       ret = await worker.recognize(file)
diff --git a/src/main/services/ocr/tesseract/TesseractService.ts b/src/main/services/ocr/tesseract/TesseractService.ts
index 7700324606..8440e48c79 100644
--- a/src/main/services/ocr/tesseract/TesseractService.ts
+++ b/src/main/services/ocr/tesseract/TesseractService.ts
@@ -1,10 +1,12 @@
 import { loggerService } from '@logger'
+import { getIpCountry } from '@main/utils/ipService'
+import { tesseractLangs, TesseractLangsDownloadUrl } from '@shared/config/constant'
+import { app } from 'electron'
+import path from 'path'
 import Tesseract, { createWorker } from 'tesseract.js'
 
 const logger = loggerService.withContext('TesseractService')
 
-let worker: Tesseract.Worker | null = null
-
 // const languageCodeMap: Record = {
 // 'af-za': 'afr',
 // 'am-et': 'amh',
@@ -110,20 +112,76 @@ let worker: Tesseract.Worker | null = null
 // 'yi-us': 'yid'
 // }
 
-export const getTesseractWorker = async (): Promise => {
-  if (!worker) {
-    // for now, only support limited languages
-    worker = await createWorker(['chi_sim', 'chi_tra', 'eng'], undefined, {
-      // langPath: getCacheDir(),
-      logger: (m) => logger.debug('From worker', m)
-    })
-  }
-  return worker
-}
-
-export const disposeTesseractWorker = async () => {
-  if (worker) {
-    await worker.terminate()
-    worker = null
-  }
-}
+/**
+ * Owns the single tesseract.js worker shared by OCR requests and manages its lifecycle.
+ */
+export class TesseractService {
+  // Pending or resolved worker creation; caching the promise lets concurrent callers share one worker.
+  private workerPromise: Promise<Tesseract.Worker> | null = null
+
+  /**
+   * Returns the shared worker, creating it on first use.
+   *
+   * Calls made while creation is still in progress await the same promise, so only one
+   * worker is created per cache lifetime. A failed creation is not cached; the next call tries again.
+   *
+   * @returns The shared worker.
+   * @throws Whatever language-path resolution or createWorker rejects with.
+   */
+  async getWorker(): Promise<Tesseract.Worker> {
+    if (!this.workerPromise) {
+      const pending = this._createWorker()
+      this.workerPromise = pending
+      // Forget a failed attempt so it can be retried instead of replaying the rejection.
+      pending.catch(() => {
+        if (this.workerPromise === pending) {
+          this.workerPromise = null
+        }
+      })
+    }
+    return this.workerPromise
+  }
+
+  /** Creates a worker for the languages in tesseractLangs using the resolved language and cache paths. */
+  private async _createWorker(): Promise<Tesseract.Worker> {
+    // for now, only support limited languages
+    return createWorker(tesseractLangs, undefined, {
+      langPath: await this._getLangPath(),
+      cachePath: this._getCacheDir(),
+      logger: (m) => logger.debug('From worker', m)
+    })
+  }
+
+  /** Picks the language-data URL: the CN entry when the IP country is 'cn' (any case), otherwise GLOBAL. */
+  private async _getLangPath(): Promise<string> {
+    const country = await getIpCountry()
+    return country.toLowerCase() === 'cn' ? TesseractLangsDownloadUrl.CN : TesseractLangsDownloadUrl.GLOBAL
+  }
+
+  /** Returns the 'tesseract' directory under the Electron userData path, passed to the worker as cachePath. */
+  private _getCacheDir(): string {
+    return path.join(app.getPath('userData'), 'tesseract')
+  }
+
+  /**
+   * Terminates the shared worker, if any. A later getWorker() call creates a new one.
+   */
+  async dispose(): Promise<void> {
+    const pending = this.workerPromise
+    if (!pending) {
+      return
+    }
+    // Clear the cache first so a getWorker() call made during termination gets a fresh worker.
+    this.workerPromise = null
+    let worker: Tesseract.Worker
+    try {
+      worker = await pending
+    } catch {
+      // Creation failed, so there is nothing to terminate.
+      return
+    }
+    await worker.terminate()
+  }
+}
+
+export const tesseractService = new TesseractService()