feat: Add PaddleOCR as a new OCR provider (#9876)

* feat: support PaddleOCR as an OCR provider

* style: fix format

* fix: update persistReducer version

* update wrt comments

* fix(ocr): 修复迁移147中OCR提供商的设置错误

将直接赋值改为使用addOcrProvider方法添加内置PaddleOCR提供商,确保正确初始化OCR服务

* Replace bare fetch with net.fetch

* Use '\n' as delimiter

* Optimize code wrt comments

* Add tip

---------

Co-authored-by: icarus <eurfelux@gmail.com>
This commit is contained in:
Lin Manhui 2025-09-04 17:13:58 +08:00 committed by GitHub
parent cac84a8795
commit 7de31d8cb6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 303 additions and 5 deletions

View File

@ -2,6 +2,7 @@ import { loggerService } from '@logger'
import { isLinux } from '@main/constant'
import { BuiltinOcrProviderIds, OcrHandler, OcrProvider, OcrResult, SupportedOcrFile } from '@types'
import { ppocrService } from './builtin/PpocrService'
import { systemOcrService } from './builtin/SystemOcrService'
import { tesseractService } from './builtin/TesseractService'
@ -36,3 +37,5 @@ export const ocrService = new OcrService()
// Register the built-in OCR handlers under their provider ids.
// Each handler is bound to its service instance before being handed to the registry.
ocrService.register(BuiltinOcrProviderIds.tesseract, tesseractService.ocr.bind(tesseractService))
// The system OCR handler is skipped on Linux.
!isLinux && ocrService.register(BuiltinOcrProviderIds.system, systemOcrService.ocr.bind(systemOcrService))
ocrService.register(BuiltinOcrProviderIds.paddleocr, ppocrService.ocr.bind(ppocrService))

View File

@ -0,0 +1,100 @@
import { loadOcrImage } from '@main/utils/ocr'
import { ImageFileMetadata, isImageFileMetadata, OcrPpocrConfig, OcrResult, SupportedOcrFile } from '@types'
import { net } from 'electron'
import { z } from 'zod'
import { OcrBaseService } from './OcrBaseService'
/**
 * Value placed in the `fileType` field of the OCR request payload.
 * The numeric values are serialized into the request as-is, so they must not be renumbered.
 */
enum FileType {
PDF = 0,
// The only value this service currently sends.
Image = 1
}
// API Reference: https://www.paddleocr.ai/latest/version3.x/pipeline_usage/OCR.html#3
/**
 * JSON request body posted to the PaddleOCR serving endpoint.
 * Only `file` is required; every other field is optional and may also be null.
 * See the API reference above for the meaning of the individual tuning fields.
 */
interface OcrPayload {
// File content; this service sends the image bytes base64-encoded.
file: string
fileType?: FileType | null
useDocOrientationClassify?: boolean | null
useDocUnwarping?: boolean | null
useTextlineOrientation?: boolean | null
textDetLimitSideLen?: number | null
textDetLimitType?: string | null
textDetThresh?: number | null
textDetBoxThresh?: number | null
textDetUnclipRatio?: number | null
textRecScoreThresh?: number | null
visualize?: boolean | null
}
// Recognized text lines of a single OCR result.
const PrunedResultSchema = z.object({
  rec_texts: z.array(z.string())
})

// One entry of `result.ocrResults`.
const OcrResultItemSchema = z.object({
  prunedResult: PrunedResultSchema
})

/**
 * Shape of the part of the PaddleOCR serving response that this service reads:
 * `result.ocrResults[].prunedResult.rec_texts`.
 */
const OcrResponseSchema = z.object({
  result: z.object({
    ocrResults: z.array(OcrResultItemSchema)
  })
})
/**
 * OCR service backed by a PaddleOCR serving endpoint.
 *
 * The image is sent base64-encoded in a JSON POST to the configured API URL and the
 * recognized text lines of the first OCR result are returned joined with '\n'.
 */
export class PpocrService extends OcrBaseService {
  /**
   * Runs OCR on an image file.
   *
   * @param file - File to recognize; only image file metadata is accepted.
   * @param options - PaddleOCR provider config (`apiUrl` is required, `accessToken` is optional).
   * @returns The recognized text.
   * @throws Error if the file is not an image, the config is missing, or the request fails.
   */
  public ocr = async (file: SupportedOcrFile, options?: OcrPpocrConfig): Promise<OcrResult> => {
    if (!isImageFileMetadata(file)) {
      throw new Error('Only image files are supported currently')
    }
    if (!options) {
      throw new Error('config is required')
    }
    return this.imageOcr(file, options)
  }

  /**
   * Posts the image to the OCR endpoint and extracts the recognized text.
   *
   * @throws Error if `apiUrl` is empty; errors from loading the image propagate unchanged.
   * @throws Error prefixed with "OCR service error: " for transport failures, non-2xx
   *   responses, responses that do not match the expected schema, and empty result lists.
   */
  private async imageOcr(file: ImageFileMetadata, options: OcrPpocrConfig): Promise<OcrResult> {
    if (!options.apiUrl) {
      throw new Error('API URL is required')
    }
    const apiUrl = options.apiUrl

    const buffer = await loadOcrImage(file)
    const base64 = buffer.toString('base64')
    const payload = {
      file: base64,
      fileType: FileType.Image,
      useDocOrientationClassify: false,
      useDocUnwarping: false,
      visualize: false
    } satisfies OcrPayload

    const headers: Record<string, string> = {
      'Content-Type': 'application/json'
    }
    if (options.accessToken) {
      headers['Authorization'] = `token ${options.accessToken}`
    }

    try {
      const response = await net.fetch(apiUrl, {
        method: 'POST',
        headers,
        body: JSON.stringify(payload)
      })

      if (!response.ok) {
        const text = await response.text()
        // No prefix here: the catch below adds "OCR service error: " exactly once.
        throw new Error(`${response.status} ${response.statusText} - ${text}`)
      }

      const data: unknown = await response.json()
      const validatedResponse = OcrResponseSchema.parse(data)

      // The schema allows an empty array, so guard before dereferencing the first entry.
      const firstResult = validatedResponse.result.ocrResults[0]
      if (!firstResult) {
        throw new Error('response contains no OCR results')
      }

      return { text: firstResult.prunedResult.rec_texts.join('\n') }
    } catch (error: unknown) {
      // Non-Error values can be thrown too; avoid printing "undefined" for them.
      const message = error instanceof Error ? error.message : String(error)
      throw new Error(`OCR service error: ${message}`)
    }
  }
}
// Module-level PpocrService instance exported for use by other modules.
export const ppocrService = new PpocrService()

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

View File

@ -1,6 +1,7 @@
import {
BuiltinOcrProvider,
BuiltinOcrProviderId,
OcrPpocrProvider,
OcrProviderCapability,
OcrSystemProvider,
OcrTesseractProvider,
@ -37,9 +38,22 @@ const systemOcr: OcrSystemProvider = {
}
} as const satisfies OcrSystemProvider
/**
 * Built-in PaddleOCR provider definition.
 * Only image OCR is declared as a capability; PDF support is commented out for now.
 */
const ppocrOcr: OcrPpocrProvider = {
id: 'paddleocr',
name: 'PaddleOCR',
config: {
// Empty by default; the user has to supply the endpoint in the provider settings.
apiUrl: ''
},
capabilities: {
image: true
// pdf: true
}
} as const
export const BUILTIN_OCR_PROVIDERS_MAP = {
tesseract,
system: systemOcr
system: systemOcr,
paddleocr: ppocrOcr
} as const satisfies Record<BuiltinOcrProviderId, BuiltinOcrProvider>
// Flat list of the built-in OCR providers, taken from the values of BUILTIN_OCR_PROVIDERS_MAP.
export const BUILTIN_OCR_PROVIDERS: BuiltinOcrProvider[] = Object.values(BUILTIN_OCR_PROVIDERS_MAP)

View File

@ -1,4 +1,5 @@
import { loggerService } from '@logger'
import PaddleocrLogo from '@renderer/assets/images/providers/paddleocr.png'
import TesseractLogo from '@renderer/assets/images/providers/Tesseract.js.png'
import { BUILTIN_OCR_PROVIDERS_MAP, DEFAULT_OCR_PROVIDER } from '@renderer/config/ocr'
import { getBuiltinOcrProviderLabel } from '@renderer/i18n/label'
@ -80,6 +81,8 @@ export const useOcrProviders = () => {
return <Avatar size={size} src={TesseractLogo} />
case 'system':
return <MonitorIcon size={size} />
case 'paddleocr':
return <Avatar size={size} src={PaddleocrLogo} />
}
}
return <FileQuestionMarkIcon size={size} />

View File

@ -327,10 +327,12 @@ export const getBuiltInMcpServerDescriptionLabel = (key: string): string => {
const builtinOcrProviderKeyMap = {
system: 'ocr.builtin.system',
tesseract: ''
tesseract: '',
paddleocr: ''
} as const satisfies Record<BuiltinOcrProviderId, string>
/**
 * Returns the display label for a built-in OCR provider.
 *
 * 'tesseract' and 'paddleocr' map to fixed product names; any other id is resolved
 * through `getLabel` with `builtinOcrProviderKeyMap`.
 *
 * @param key - Built-in OCR provider id.
 * @returns The label to display for that provider.
 */
export const getBuiltinOcrProviderLabel = (key: BuiltinOcrProviderId) => {
  // Strict equality throughout: the ids are plain string literals.
  if (key === 'tesseract') return 'Tesseract'
  if (key === 'paddleocr') return 'PaddleOCR'
  return getLabel(builtinOcrProviderKeyMap, key)
}

View File

@ -3884,6 +3884,13 @@
"title": "Image"
},
"image_provider": "OCR service provider",
"paddleocr": {
"aistudio_access_token": "Access token of AI Studio Community",
"aistudio_url_label": "AI Studio Community",
"api_url": "API URL",
"serving_doc_url_label": "PaddleOCR Serving Documentation",
"tip": "You can refer to the official PaddleOCR documentation to deploy a local service, or deploy a cloud service on the PaddlePaddle AI Studio Community. For the latter case, please provide the access token of the AI Studio Community."
},
"system": {
"win": {
"langs_tooltip": "Dependent on Windows to provide services, you need to download language packs in the system to support the relevant languages."

View File

@ -3884,6 +3884,13 @@
"title": "画像"
},
"image_provider": "OCRサービスプロバイダー",
"paddleocr": {
"aistudio_access_token": "AI Studio Community のアクセストークン",
"aistudio_url_label": "AI Studio Community",
"api_url": "API URL",
"serving_doc_url_label": "PaddleOCR サービング ドキュメント",
"tip": "ローカルサービスをデプロイするには、公式の PaddleOCR ドキュメントを参照するか、PaddlePaddle AI Studio コミュニティ上でクラウドサービスをデプロイすることができます。後者の場合は、AI Studio コミュニティのアクセストークンを提供してください。"
},
"system": {
"win": {
"langs_tooltip": "Windows が提供するサービスに依存しており、関連する言語をサポートするには、システムで言語パックをダウンロードする必要があります。"

View File

@ -3884,6 +3884,13 @@
"title": "Изображение"
},
"image_provider": "Поставщик услуг OCR",
"paddleocr": {
"aistudio_access_token": "Токен доступа сообщества AI Studio",
"aistudio_url_label": "Сообщество AI Studio",
"api_url": "URL API",
"serving_doc_url_label": "Документация по PaddleOCR Serving",
"tip": "Вы можете обратиться к официальной документации PaddleOCR, чтобы развернуть локальный сервис, либо развернуть облачный сервис в сообществе PaddlePaddle AI Studio. В последнем случае, пожалуйста, предоставьте токен доступа сообщества AI Studio."
},
"system": {
"win": {
"langs_tooltip": "Для предоставления служб Windows необходимо загрузить языковой пакет в системе для поддержки соответствующего языка."

View File

@ -3884,6 +3884,13 @@
"title": "图片"
},
"image_provider": "OCR 服务提供商",
"paddleocr": {
"aistudio_access_token": "星河社区访问令牌",
"aistudio_url_label": "星河社区",
"api_url": "API URL",
"serving_doc_url_label": "PaddleOCR 服务化部署文档",
"tip": "您可以参考 PaddleOCR 官方文档部署本地服务,或者在飞桨星河社区部署云服务。对于后一种情况,请填写星河社区访问令牌。"
},
"system": {
"win": {
"langs_tooltip": "依赖 Windows 提供服务,您需要在系统中下载语言包来支持相关语言。"

View File

@ -3884,6 +3884,13 @@
"title": "圖片"
},
"image_provider": "OCR 服務提供商",
"paddleocr": {
"aistudio_access_token": "星河社群存取權杖",
"aistudio_url_label": "星河社群",
"api_url": "API 網址",
"serving_doc_url_label": "PaddleOCR 服務化部署文件",
"tip": "您可以參考 PaddleOCR 官方文件來部署本機服務,或是在飛槳星河社群部署雲端服務。對於後者,請提供星河社群的存取權杖。"
},
"system": {
"win": {
"langs_tooltip": "依賴 Windows 提供服務,您需要在系統中下載語言包來支援相關語言。"

View File

@ -3884,6 +3884,13 @@
"title": "Εικόνα"
},
"image_provider": "Πάροχοι υπηρεσιών OCR",
"paddleocr": {
"aistudio_access_token": "Διακριτικό πρόσβασης της κοινότητας AI Studio",
"aistudio_url_label": "Κοινότητα AI Studio",
"api_url": "Διεύθυνση URL API",
"serving_doc_url_label": "Τεκμηρίωση PaddleOCR Serving",
"tip": "Μπορείτε να ανατρέξετε στην επίσημη τεκμηρίωση του PaddleOCR για να αναπτύξετε μια τοπική υπηρεσία, ή να αναπτύξετε μια υπηρεσία στο cloud στην Κοινότητα PaddlePaddle AI Studio. Στη δεύτερη περίπτωση, παρακαλώ παρέχετε το διακριτικό πρόσβασης (access token) της Κοινότητας AI Studio."
},
"system": {
"win": {
"langs_tooltip": "Εξαρτάται από τα Windows για την παροχή υπηρεσιών, πρέπει να κατεβάσετε το πακέτο γλώσσας στο σύστημα για να υποστηρίξετε τις σχετικές γλώσσες."

View File

@ -3884,6 +3884,13 @@
"title": "Imagen"
},
"image_provider": "Proveedor de servicios OCR",
"paddleocr": {
"aistudio_access_token": "Token de acceso de la comunidad de AI Studio",
"aistudio_url_label": "Comunidad de AI Studio",
"api_url": "URL de la API",
"serving_doc_url_label": "Documentación de PaddleOCR Serving",
"tip": "Puede consultar la documentación oficial de PaddleOCR para implementar un servicio local, o implementar un servicio en la nube en la Comunidad de PaddlePaddle AI Studio. En este último caso, proporcione el token de acceso de la Comunidad de AI Studio."
},
"system": {
"win": {
"langs_tooltip": "Dependiendo de Windows para proporcionar servicios, necesita descargar el paquete de idioma en el sistema para admitir los idiomas correspondientes."

View File

@ -3884,6 +3884,13 @@
"title": "Image"
},
"image_provider": "Fournisseur de service OCR",
"paddleocr": {
"aistudio_access_token": "Jeton d'accès de la communauté AI Studio",
"aistudio_url_label": "Communauté AI Studio",
"api_url": "URL de l'API",
"serving_doc_url_label": "Documentation de PaddleOCR Serving",
"tip": "Vous pouvez consulter la documentation officielle de PaddleOCR pour déployer un service local, ou déployer un service cloud sur la Communauté PaddlePaddle AI Studio. Dans ce dernier cas, veuillez fournir le jeton d'accès de la Communauté AI Studio."
},
"system": {
"win": {
"langs_tooltip": "Dépendre de Windows pour fournir des services, vous devez télécharger des packs linguistiques dans le système afin de prendre en charge les langues concernées."

View File

@ -3884,6 +3884,13 @@
"title": "Imagem"
},
"image_provider": "Provedor de serviços OCR",
"paddleocr": {
"aistudio_access_token": "Token de acesso da comunidade AI Studio",
"aistudio_url_label": "Comunidade AI Studio",
"api_url": "URL da API",
"serving_doc_url_label": "Documentação do PaddleOCR Serving",
"tip": "Você pode consultar a documentação oficial do PaddleOCR para implantar um serviço local ou implantar um serviço na nuvem na Comunidade PaddlePaddle AI Studio. No último caso, forneça o token de acesso da Comunidade AI Studio."
},
"system": {
"win": {
"langs_tooltip": "Dependendo do Windows para fornecer serviços, você precisa baixar pacotes de idiomas no sistema para dar suporte aos idiomas relevantes."

View File

@ -0,0 +1,83 @@
import { ErrorBoundary } from '@renderer/components/ErrorBoundary'
import { useOcrProvider } from '@renderer/hooks/useOcrProvider'
import { BuiltinOcrProviderIds, isOcrPpocrProvider } from '@renderer/types'
import { Input } from 'antd'
import { startTransition, useCallback, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { SettingHelpLink, SettingHelpText, SettingHelpTextRow, SettingRow, SettingRowTitle } from '..'
// Hack: Hard-coded for now
const PPOCR_SERVING_DOC_URL = 'https://www.paddleocr.ai/latest/version3.x/deployment/serving.html'
const PPOCR_AISTUDIO_URL = 'https://aistudio.baidu.com/pipeline/mine'

/**
 * Settings panel for the built-in PaddleOCR provider.
 *
 * Keeps the API URL and access token in local state while the user types and writes
 * both values to the provider config when either input loses focus.
 */
export const OcrPpocrSettings = () => {
  const { t } = useTranslation()
  const { provider, updateConfig } = useOcrProvider(BuiltinOcrProviderIds.paddleocr)

  // NOTE(review): this throws while rendering this component, so the <ErrorBoundary>
  // returned below cannot catch it; an ancestor boundary would have to.
  if (!isOcrPpocrProvider(provider)) {
    throw new Error('Not PaddleOCR provider.')
  }

  const [apiUrl, setApiUrl] = useState<string>(provider.config.apiUrl || '')
  const [accessToken, setAccessToken] = useState<string>(provider.config.accessToken || '')

  // NOTE(review): React's documentation advises against updating the state of a controlled
  // input inside a Transition; consider calling the setters directly.
  const onApiUrlChange = useCallback(({ target }: React.ChangeEvent<HTMLInputElement>) => {
    const nextUrl = target.value
    startTransition(() => {
      setApiUrl(nextUrl)
    })
  }, [])

  const onAccessTokenChange = useCallback(({ target }: React.ChangeEvent<HTMLInputElement>) => {
    const nextToken = target.value
    startTransition(() => {
      setAccessToken(nextToken)
    })
  }, [])

  // Persist both fields together whenever either input is blurred.
  const onBlur = useCallback(() => {
    updateConfig({ apiUrl, accessToken })
  }, [apiUrl, accessToken, updateConfig])

  const apiUrlLabel = t('settings.tool.ocr.paddleocr.api_url')
  const accessTokenLabel = t('settings.tool.ocr.paddleocr.aistudio_access_token')

  return (
    <ErrorBoundary>
      <SettingRow style={{ marginBottom: 10 }}>
        <SettingRowTitle style={{ width: 150 }}>{apiUrlLabel}</SettingRowTitle>
        <Input value={apiUrl} onChange={onApiUrlChange} onBlur={onBlur} placeholder={apiUrlLabel} />
      </SettingRow>

      <SettingRow style={{ marginBottom: 10 }}>
        <SettingRowTitle style={{ width: 150 }}>{accessTokenLabel}</SettingRowTitle>
        <Input.Password
          value={accessToken}
          onChange={onAccessTokenChange}
          onBlur={onBlur}
          placeholder={accessTokenLabel}
          spellCheck={false}
        />
      </SettingRow>

      <SettingHelpTextRow style={{ display: 'flex', flexDirection: 'column' }}>
        <SettingHelpText style={{ marginBottom: 5 }}>{t('settings.tool.ocr.paddleocr.tip')}</SettingHelpText>
        <div style={{ display: 'flex', gap: 12 }}>
          <SettingHelpLink target="_blank" href={PPOCR_SERVING_DOC_URL}>
            {t('settings.tool.ocr.paddleocr.serving_doc_url_label')}
          </SettingHelpLink>
          <SettingHelpLink target="_blank" href={PPOCR_AISTUDIO_URL}>
            {t('settings.tool.ocr.paddleocr.aistudio_url_label')}
          </SettingHelpLink>
        </div>
      </SettingHelpTextRow>
    </ErrorBoundary>
  )
}

View File

@ -8,6 +8,7 @@ import { Divider, Flex } from 'antd'
import styled from 'styled-components'
import { SettingGroup, SettingTitle } from '..'
import { OcrPpocrSettings } from './OcrPpocrSettings'
import { OcrSystemSettings } from './OcrSystemSettings'
import { OcrTesseractSettings } from './OcrTesseractSettings'
@ -32,6 +33,8 @@ const OcrProviderSettings = ({ provider }: Props) => {
return <OcrTesseractSettings />
case 'system':
return <OcrSystemSettings />
case 'paddleocr':
return <OcrPpocrSettings />
default:
return null
}

View File

@ -67,7 +67,7 @@ const persistedReducer = persistReducer(
{
key: 'cherry-studio',
storage,
version: 147,
version: 148,
blacklist: ['runtime', 'messages', 'messageBlocks', 'tabs'],
migrate
},

View File

@ -2383,6 +2383,15 @@ const migrateConfig = {
logger.error('migrate 147 error', error as Error)
return state
}
},
// Migration 148: adds the built-in PaddleOCR provider to the persisted state via addOcrProvider.
'148': (state: RootState) => {
  try {
    addOcrProvider(state, BUILTIN_OCR_PROVIDERS_MAP.paddleocr)
  } catch (error) {
    // Log and fall through so the state is returned either way.
    logger.error('migrate 148 error', error as Error)
  }
  return state
}
}

View File

@ -4,7 +4,8 @@ import { FileMetadata, ImageFileMetadata, isImageFileMetadata, TranslateLanguage
export const BuiltinOcrProviderIds = {
tesseract: 'tesseract',
system: 'system'
system: 'system',
paddleocr: 'paddleocr'
} as const
export type BuiltinOcrProviderId = keyof typeof BuiltinOcrProviderIds
@ -74,7 +75,7 @@ export type OcrProviderBaseConfig = {
enabled?: boolean
}
export type OcrProviderConfig = OcrApiProviderConfig | OcrTesseractConfig | OcrSystemConfig
export type OcrProviderConfig = OcrApiProviderConfig | OcrTesseractConfig | OcrSystemConfig | OcrPpocrConfig
export type OcrProvider = {
id: string
@ -170,3 +171,20 @@ export type OcrSystemProvider = {
/** Type guard: true when the provider's id is the built-in system OCR id. */
export const isOcrSystemProvider = (p: OcrProvider): p is OcrSystemProvider =>
  p.id === BuiltinOcrProviderIds.system
// PaddleOCR Types
/**
 * Config of the PaddleOCR provider: the base provider config plus the endpoint settings.
 */
export type OcrPpocrConfig = OcrProviderBaseConfig & {
// URL of the OCR endpoint that requests are posted to.
apiUrl?: string
// Optional token; when set it is sent in the Authorization header.
accessToken?: string
}
/**
 * The built-in PaddleOCR provider: fixed id 'paddleocr' with a PaddleOCR config.
 * It is an image OCR provider; the PDF provider intersection is commented out for now.
 */
export type OcrPpocrProvider = {
id: 'paddleocr'
config: OcrPpocrConfig
} & ImageOcrProvider &
// PdfOcrProvider &
BuiltinOcrProvider
/** Type guard: true when the provider's id is the built-in PaddleOCR id. */
export const isOcrPpocrProvider = (p: OcrProvider): p is OcrPpocrProvider =>
  p.id === BuiltinOcrProviderIds.paddleocr