mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2026-01-06 05:09:09 +08:00
feat(ocr): enhance Tesseract service with language support and worker management
- Added support for multiple Tesseract languages: Chinese (Simplified and Traditional) and English.
- Refactored Tesseract worker management into a class for better encapsulation and reuse.
- Introduced methods to dynamically determine the language path based on IP country and to manage the worker lifecycle.
This commit is contained in:
parent
dbf35b79ad
commit
9acfb5fb9e
@ -197,6 +197,12 @@ export enum FeedUrl {
|
|||||||
GITHUB_LATEST = 'https://github.com/CherryHQ/cherry-studio/releases/latest/download'
|
GITHUB_LATEST = 'https://github.com/CherryHQ/cherry-studio/releases/latest/download'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export const tesseractLangs = ['chi_sim', 'chi_tra', 'eng']
|
||||||
|
export enum TesseractLangsDownloadUrl {
|
||||||
|
CN = 'https://gitcode.com/tesseract-ocr/tessdata/raw/main/',
|
||||||
|
GLOBAL = 'https://github.com/tesseract-ocr/tessdata/raw/main/'
|
||||||
|
}
|
||||||
|
|
||||||
export enum UpgradeChannel {
|
export enum UpgradeChannel {
|
||||||
LATEST = 'latest', // 最新稳定版本
|
LATEST = 'latest', // 最新稳定版本
|
||||||
RC = 'rc', // 公测版本
|
RC = 'rc', // 公测版本
|
||||||
|
|||||||
@ -13,7 +13,7 @@ import {
|
|||||||
import { statSync } from 'fs'
|
import { statSync } from 'fs'
|
||||||
import { readFile } from 'fs/promises'
|
import { readFile } from 'fs/promises'
|
||||||
|
|
||||||
import { getTesseractWorker } from './tesseract/TesseractService'
|
import { tesseractService } from './tesseract/TesseractService'
|
||||||
|
|
||||||
const logger = loggerService.withContext('main:OcrService')
|
const logger = loggerService.withContext('main:OcrService')
|
||||||
|
|
||||||
@ -25,7 +25,7 @@ const logger = loggerService.withContext('main:OcrService')
|
|||||||
*/
|
*/
|
||||||
const tesseractOcr = async (file: ImageFileMetadata | string): Promise<Tesseract.RecognizeResult> => {
|
const tesseractOcr = async (file: ImageFileMetadata | string): Promise<Tesseract.RecognizeResult> => {
|
||||||
try {
|
try {
|
||||||
const worker = await getTesseractWorker()
|
const worker = await tesseractService.getWorker()
|
||||||
let ret: Tesseract.RecognizeResult
|
let ret: Tesseract.RecognizeResult
|
||||||
if (typeof file === 'string') {
|
if (typeof file === 'string') {
|
||||||
ret = await worker.recognize(file)
|
ret = await worker.recognize(file)
|
||||||
|
|||||||
@ -1,10 +1,12 @@
|
|||||||
import { loggerService } from '@logger'
|
import { loggerService } from '@logger'
|
||||||
|
import { getIpCountry } from '@main/utils/ipService'
|
||||||
|
import { TesseractLangsDownloadUrl } from '@shared/config/constant'
|
||||||
|
import { app } from 'electron'
|
||||||
|
import path from 'path'
|
||||||
import Tesseract, { createWorker } from 'tesseract.js'
|
import Tesseract, { createWorker } from 'tesseract.js'
|
||||||
|
|
||||||
const logger = loggerService.withContext('TesseractService')
|
const logger = loggerService.withContext('TesseractService')
|
||||||
|
|
||||||
let worker: Tesseract.Worker | null = null
|
|
||||||
|
|
||||||
// const languageCodeMap: Record<string, string> = {
|
// const languageCodeMap: Record<string, string> = {
|
||||||
// 'af-za': 'afr',
|
// 'af-za': 'afr',
|
||||||
// 'am-et': 'amh',
|
// 'am-et': 'amh',
|
||||||
@ -110,20 +112,36 @@ let worker: Tesseract.Worker | null = null
|
|||||||
// 'yi-us': 'yid'
|
// 'yi-us': 'yid'
|
||||||
// }
|
// }
|
||||||
|
|
||||||
export const getTesseractWorker = async (): Promise<Tesseract.Worker> => {
|
export class TesseractService {
|
||||||
if (!worker) {
|
private worker: Tesseract.Worker | null = null
|
||||||
// for now, only support limited languages
|
|
||||||
worker = await createWorker(['chi_sim', 'chi_tra', 'eng'], undefined, {
|
async getWorker(): Promise<Tesseract.Worker> {
|
||||||
// langPath: getCacheDir(),
|
if (!this.worker) {
|
||||||
logger: (m) => logger.debug('From worker', m)
|
// for now, only support limited languages
|
||||||
})
|
this.worker = await createWorker(['chi_sim', 'chi_tra', 'eng'], undefined, {
|
||||||
|
langPath: await this._getLangPath(),
|
||||||
|
cachePath: this._getCacheDir(),
|
||||||
|
logger: (m) => logger.debug('From worker', m)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return this.worker
|
||||||
|
}
|
||||||
|
|
||||||
|
private async _getLangPath(): Promise<string> {
|
||||||
|
const country = await getIpCountry()
|
||||||
|
return country.toLowerCase() === 'cn' ? TesseractLangsDownloadUrl.CN : TesseractLangsDownloadUrl.GLOBAL
|
||||||
|
}
|
||||||
|
|
||||||
|
private _getCacheDir(): string {
|
||||||
|
return path.join(app.getPath('userData'), 'tesseract')
|
||||||
|
}
|
||||||
|
|
||||||
|
async dispose(): Promise<void> {
|
||||||
|
if (this.worker) {
|
||||||
|
await this.worker.terminate()
|
||||||
|
this.worker = null
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return worker
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export const disposeTesseractWorker = async () => {
|
export const tesseractService = new TesseractService()
|
||||||
if (worker) {
|
|
||||||
await worker.terminate()
|
|
||||||
worker = null
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user