feat(ocr): enhance Tesseract service with language support and worker management

- Added support for multiple Tesseract languages: Chinese (Simplified and Traditional) and English.
- Refactored Tesseract worker management into a class for better encapsulation and reuse.
- Introduced methods to dynamically determine language path based on IP country and manage worker lifecycle.
This commit is contained in:
beyondkmp 2025-08-23 08:44:42 +08:00
parent dbf35b79ad
commit 9acfb5fb9e
3 changed files with 42 additions and 18 deletions

View File

@ -197,6 +197,12 @@ export enum FeedUrl {
GITHUB_LATEST = 'https://github.com/CherryHQ/cherry-studio/releases/latest/download' GITHUB_LATEST = 'https://github.com/CherryHQ/cherry-studio/releases/latest/download'
} }
/** Tesseract language codes: Simplified Chinese, Traditional Chinese, and English. */
export const tesseractLangs = ['chi_sim', 'chi_tra', 'eng']
/**
 * Base URLs of the tessdata repository from which Tesseract language
 * data is downloaded. Each value ends with a trailing slash.
 */
export enum TesseractLangsDownloadUrl {
// tessdata repository hosted on gitcode.com
CN = 'https://gitcode.com/tesseract-ocr/tessdata/raw/main/',
// tessdata repository hosted on github.com
GLOBAL = 'https://github.com/tesseract-ocr/tessdata/raw/main/'
}
export enum UpgradeChannel { export enum UpgradeChannel {
LATEST = 'latest', // 最新稳定版本 LATEST = 'latest', // 最新稳定版本
RC = 'rc', // 公测版本 RC = 'rc', // 公测版本

View File

@ -13,7 +13,7 @@ import {
import { statSync } from 'fs' import { statSync } from 'fs'
import { readFile } from 'fs/promises' import { readFile } from 'fs/promises'
import { getTesseractWorker } from './tesseract/TesseractService' import { tesseractService } from './tesseract/TesseractService'
const logger = loggerService.withContext('main:OcrService') const logger = loggerService.withContext('main:OcrService')
@ -25,7 +25,7 @@ const logger = loggerService.withContext('main:OcrService')
*/ */
const tesseractOcr = async (file: ImageFileMetadata | string): Promise<Tesseract.RecognizeResult> => { const tesseractOcr = async (file: ImageFileMetadata | string): Promise<Tesseract.RecognizeResult> => {
try { try {
const worker = await getTesseractWorker() const worker = await tesseractService.getWorker()
let ret: Tesseract.RecognizeResult let ret: Tesseract.RecognizeResult
if (typeof file === 'string') { if (typeof file === 'string') {
ret = await worker.recognize(file) ret = await worker.recognize(file)

View File

@ -1,10 +1,12 @@
import { loggerService } from '@logger' import { loggerService } from '@logger'
import { getIpCountry } from '@main/utils/ipService'
import { TesseractLangsDownloadUrl } from '@shared/config/constant'
import { app } from 'electron'
import path from 'path'
import Tesseract, { createWorker } from 'tesseract.js' import Tesseract, { createWorker } from 'tesseract.js'
const logger = loggerService.withContext('TesseractService') const logger = loggerService.withContext('TesseractService')
let worker: Tesseract.Worker | null = null
// const languageCodeMap: Record<string, string> = { // const languageCodeMap: Record<string, string> = {
// 'af-za': 'afr', // 'af-za': 'afr',
// 'am-et': 'amh', // 'am-et': 'amh',
@ -110,20 +112,36 @@ let worker: Tesseract.Worker | null = null
// 'yi-us': 'yid' // 'yi-us': 'yid'
// } // }
export const getTesseractWorker = async (): Promise<Tesseract.Worker> => { export class TesseractService {
if (!worker) { private worker: Tesseract.Worker | null = null
// for now, only support limited languages
worker = await createWorker(['chi_sim', 'chi_tra', 'eng'], undefined, { async getWorker(): Promise<Tesseract.Worker> {
// langPath: getCacheDir(), if (!this.worker) {
logger: (m) => logger.debug('From worker', m) // for now, only support limited languages
}) this.worker = await createWorker(['chi_sim', 'chi_tra', 'eng'], undefined, {
langPath: await this._getLangPath(),
cachePath: this._getCacheDir(),
logger: (m) => logger.debug('From worker', m)
})
}
return this.worker
}
/**
 * Picks the base URL used to fetch Tesseract language data.
 *
 * @returns `TesseractLangsDownloadUrl.CN` when `getIpCountry()` resolves to
 * 'cn' (compared case-insensitively); otherwise `TesseractLangsDownloadUrl.GLOBAL`.
 */
private async _getLangPath(): Promise<string> {
  const ipCountry = await getIpCountry()
  if (ipCountry.toLowerCase() === 'cn') {
    return TesseractLangsDownloadUrl.CN
  }
  return TesseractLangsDownloadUrl.GLOBAL
}
/**
 * Builds the directory path handed to the worker as its cache location.
 *
 * @returns the `tesseract` sub-directory of Electron's `userData` path.
 */
private _getCacheDir(): string {
  const userDataDir = app.getPath('userData')
  return path.join(userDataDir, 'tesseract')
}
async dispose(): Promise<void> {
if (this.worker) {
await this.worker.terminate()
this.worker = null
}
} }
return worker
} }
export const disposeTesseractWorker = async () => { export const tesseractService = new TesseractService()
if (worker) {
await worker.terminate()
worker = null
}
}