From 4fd3300ed03620712d2a1fa7fb7a50768acb20b2 Mon Sep 17 00:00:00 2001 From: icarus Date: Mon, 20 Oct 2025 19:35:39 +0800 Subject: [PATCH] refactor(ocr): restructure ocr service and repository layers - Extract database operations to new OcrProviderRepository - Improve service initialization and provider management - Add better error handling and logging - Update API handlers to use new service methods --- src/main/data/api/handlers/index.ts | 6 +- .../repositories/OcrProviderRepository.ts | 260 ++++++++++++ src/main/services/ocr/OcrService.ts | 386 +++++++++++------- 3 files changed, 507 insertions(+), 145 deletions(-) create mode 100644 src/main/data/repositories/OcrProviderRepository.ts diff --git a/src/main/data/api/handlers/index.ts b/src/main/data/api/handlers/index.ts index 70fac91cbb..5bba22e99b 100644 --- a/src/main/data/api/handlers/index.ts +++ b/src/main/data/api/handlers/index.ts @@ -213,7 +213,7 @@ export const apiHandlers: ApiImplementation = { '/ocr/providers': { GET: async ({ query }) => { - return ocrService.listProviders(query.registered) + return ocrService.listProviders(query) }, POST: async ({ body }) => { return ocrService.createProvider(body) @@ -228,13 +228,13 @@ export const apiHandlers: ApiImplementation = { if (params.id !== body.id) { throw new Error('Provider ID in path does not match ID in body') } - return ocrService.patchProvider(body) + return ocrService.updateProvider(params.id, body) }, PUT: async ({ params, body }) => { if (params.id !== body.id) { throw new Error('Provider ID in path does not match ID in body') } - return ocrService.putProvider(body) + return ocrService.replaceProvider(body) }, DELETE: async ({ params }) => { return ocrService.deleteProvider(params.id) diff --git a/src/main/data/repositories/OcrProviderRepository.ts b/src/main/data/repositories/OcrProviderRepository.ts new file mode 100644 index 0000000000..df1abd515e --- /dev/null +++ b/src/main/data/repositories/OcrProviderRepository.ts @@ -0,0 +1,260 @@ +import { dbService } from '@data/db/DbService' +import { ocrProviderTable } from '@data/db/schemas/ocr/provider' +import { loggerService } from '@logger' +import type { + CreateOcrProviderRequest, + CreateOcrProviderResponse, + DbOcrProvider, + ListOcrProvidersQuery, + ListOcrProvidersResponse, + OcrProviderId, + PatchOcrProviderRequest, + PatchOcrProviderResponse, + PutOcrProviderRequest, + PutOcrProviderResponse +} from '@types' +import { BuiltinOcrProviderIds, isDbOcrProvider } from '@types' +import dayjs from 'dayjs' +import { eq } from 'drizzle-orm' +import { merge } from 'lodash' + +const logger = loggerService.withContext('OcrProviderRepository') + +/** + * Data access layer for OCR providers + * Handles all database operations and data validation + */ +export class OcrProviderRepository { + /** + * Get all OCR providers + */ + public async findAll(query?: ListOcrProvidersQuery): Promise { + try { + const providers = await dbService.getDb().select().from(ocrProviderTable) + + if (query?.registered) { + // Filter by registered providers (this would need to be implemented) + // For now, return all providers + return { data: providers } + } + + return { data: providers } + } catch (error) { + logger.error('Failed to find all OCR providers', error as Error) + throw error + } + } + + /** + * Get OCR provider by ID + */ + public async findById(id: OcrProviderId): Promise { + try { + const providers = await dbService + .getDb() + .select() + .from(ocrProviderTable) + .where(eq(ocrProviderTable.id, id)) + .limit(1) + + if (providers.length === 0) { + throw new Error(`OCR provider ${id} not found`) + } + + return providers[0] + } catch (error) { + logger.error(`Failed to find OCR provider ${id}`, error as Error) + throw error + } + } + + /** + * Check if provider exists + */ + public async exists(id: OcrProviderId): Promise { + try { + const providers = await dbService + .getDb() + .select({ id: ocrProviderTable.id }) + .from(ocrProviderTable) + .where(eq(ocrProviderTable.id, id)) + .limit(1) + + return providers.length > 0 + } catch (error) { + logger.error(`Failed to check if OCR provider ${id} exists`, error as Error) + throw error + } + } + + /** + * Create new OCR provider + */ + public async create(data: CreateOcrProviderRequest): Promise { + try { + // Check if provider already exists + if (await this.exists(data.id)) { + throw new Error(`OCR provider ${data.id} already exists`) + } + + const timestamp = dayjs().valueOf() + const newProvider = { + ...data, + createdAt: timestamp, + updatedAt: timestamp + } satisfies DbOcrProvider + + // Validate data structure + if (!isDbOcrProvider(newProvider)) { + throw new Error('Invalid OCR provider data') + } + + const [created] = await dbService.getDb().insert(ocrProviderTable).values(newProvider).returning() + + logger.info(`Created OCR provider: ${data.id}`) + return { data: created } + } catch (error) { + logger.error(`Failed to create OCR provider ${data.id}`, error as Error) + throw error + } + } + + /** + * Update OCR provider (partial update) + */ + public async update(id: OcrProviderId, data: Partial): Promise { + try { + const existing = await this.findById(id) + + const newProvider = { + ...merge({}, existing, data), + updatedAt: dayjs().valueOf() + } satisfies DbOcrProvider + + // Validate data structure + if (!isDbOcrProvider(newProvider)) { + throw new Error('Invalid OCR provider data') + } + + const [updated] = await dbService + .getDb() + .update(ocrProviderTable) + .set(newProvider) + .where(eq(ocrProviderTable.id, id)) + .returning() + + logger.info(`Updated OCR provider: ${id}`) + return { data: updated } + } catch (error) { + logger.error(`Failed to update OCR provider ${id}`, error as Error) + throw error + } + } + + /** + * Replace OCR provider (full update) + */ + public async replace(data: PutOcrProviderRequest): Promise { + try { + // Check if it's a built-in provider + if (BuiltinOcrProviderIds.some((pid) => pid === data.id)) { + throw new Error('Built-in OCR providers cannot be modified with PUT method.') + } + + const timestamp = dayjs().valueOf() + const existing = await this.exists(data.id) + + let newProvider: DbOcrProvider + + if (existing) { + // Update existing + const current = await this.findById(data.id) + newProvider = { + ...data, + updatedAt: timestamp, + createdAt: current.createdAt + } + } else { + // Create new + newProvider = { + ...data, + createdAt: timestamp, + updatedAt: timestamp + } + } + + // Validate data structure + if (!isDbOcrProvider(newProvider)) { + throw new Error('Invalid OCR provider data') + } + + const [saved] = await dbService + .getDb() + .insert(ocrProviderTable) + .values(newProvider) + .onConflictDoUpdate({ + target: ocrProviderTable.id, + set: newProvider + }) + .returning() + + logger.info(`Replaced OCR provider: ${data.id}`) + return { data: saved } + } catch (error) { + logger.error(`Failed to replace OCR provider ${data.id}`, error as Error) + throw error + } + } + + /** + * Delete OCR provider + */ + public async delete(id: OcrProviderId): Promise { + try { + // Check if it's a built-in provider + if (BuiltinOcrProviderIds.some((pid) => pid === id)) { + throw new Error('Built-in OCR providers cannot be deleted.') + } + + // Check if provider exists + await this.findById(id) + + await dbService.getDb().delete(ocrProviderTable).where(eq(ocrProviderTable.id, id)) + + logger.info(`Deleted OCR provider: ${id}`) + } catch (error) { + logger.error(`Failed to delete OCR provider ${id}`, error as Error) + throw error + } + } + + /** + * Initialize built-in providers in database + */ + public async initializeBuiltInProviders(): Promise { + try { + // Import built-in provider configurations + const { BUILTIN_OCR_PROVIDERS } = await import('@shared/config/ocr') + + logger.info('Initializing built-in OCR providers') + + // Check and create each built-in provider if it doesn't exist + for (const provider of BUILTIN_OCR_PROVIDERS) { + const exists = await this.exists(provider.id) + if (!exists) { + logger.info(`Creating built-in OCR provider: ${provider.id}`) + await this.create(provider) + } else { + logger.debug(`Built-in OCR provider already exists: ${provider.id}`) + } + } + + logger.info(`Initialized ${BUILTIN_OCR_PROVIDERS.length} built-in OCR providers`) + } catch (error) { + logger.error('Failed to initialize built-in OCR providers', error as Error) + throw error + } + } +} + +export const ocrProviderRepository = new OcrProviderRepository() diff --git a/src/main/services/ocr/OcrService.ts b/src/main/services/ocr/OcrService.ts index 7dac466149..e0f34c04c2 100644 --- a/src/main/services/ocr/OcrService.ts +++ b/src/main/services/ocr/OcrService.ts @@ -1,12 +1,14 @@ -import { dbService } from '@data/db/DbService' -import { ocrProviderTable } from '@data/db/schemas/ocr/provider' import { loggerService } from '@logger' +import { ocrProviderRepository } from '@main/data/repositories/OcrProviderRepository' import type { CreateOcrProviderRequest, CreateOcrProviderResponse, DbOcrProvider, + ListOcrProvidersQuery, ListOcrProvidersResponse, OcrParams, + OcrProvider, + OcrProviderId, OcrResult, PatchOcrProviderRequest, PatchOcrProviderResponse, @@ -14,10 +16,7 @@ import type { PutOcrProviderResponse, SupportedOcrFile } from '@types' -import { BuiltinOcrProviderIdMap, BuiltinOcrProviderIds, isDbOcrProvider } from '@types' -import dayjs from 'dayjs' -import { eq } from 'drizzle-orm' -import { merge } from 'lodash' +import { BuiltinOcrProviderIdMap } from '@types' import type { OcrBaseService } from './builtin/OcrBaseService' import { ovOcrService } from './builtin/OvOcrService' @@ -27,12 +26,47 @@ import { tesseractService } from './builtin/TesseractService' const logger = loggerService.withContext('OcrService') -export class OcrService { - private registry: Map = new Map() +/** + * Business logic layer for OCR operations + * Handles OCR provider registration, orchestration, and core OCR functionality + */ +class OcrService { + private registry: Map = new Map() + private initialized: boolean = false constructor() { - // TODO: Ensure builtin providers are in db. - // Register built-in providers + this.registerBuiltinProviders() + } + + /** + * Ensure the service is initialized + */ + private async ensureInitialized(): Promise { + if (!this.initialized) { + await this.initializeBuiltinProviders() + this.initialized = true + } + } + + /** + * Initialize built-in OCR providers + */ + private async initializeBuiltinProviders(): Promise { + try { + // Ensure built-in providers exist in database + await ocrProviderRepository.initializeBuiltInProviders() + + logger.info('OCR service initialized with built-in providers') + } catch (error) { + logger.error('Failed to initialize OCR service', error as Error) + throw error + } + } + + /** + * Register built-in providers (sync) + */ + private registerBuiltinProviders(): void { this.register(BuiltinOcrProviderIdMap.tesseract, tesseractService) if (systemOcrService) { @@ -46,158 +80,226 @@ export class OcrService { } } - private register(providerId: string, service: OcrBaseService): void { + /** + * Register an OCR provider service + */ + private register(providerId: OcrProviderId, service: OcrBaseService): void { if (this.registry.has(providerId)) { - logger.warn(`Provider ${providerId} has existing handler. Overwrited.`) + logger.warn(`Provider ${providerId} already registered. Overwriting.`) } this.registry.set(providerId, service) + logger.info(`Registered OCR provider: ${providerId}`) } - // @ts-expect-error not used for now, but just keep it. - private unregister(providerId: string): void { - this.registry.delete(providerId) + // Not sure when it will be needed. + /** + * Unregister an OCR provider service + */ + // private unregister(providerId: OcrProviderId): void { + // if (this.registry.delete(providerId)) { + // logger.info(`Unregistered OCR provider: ${providerId}`) + // } + // } + + /** + * Get all registered provider IDs + */ + public getRegisteredProviderIds(): OcrProviderId[] { + return Array.from(this.registry.keys()) } - public async listProviders(registered?: boolean): Promise { - const providers = await dbService.getDb().select().from(ocrProviderTable) - if (registered) { - const registeredKeys = Array.from(this.registry.keys()) - return { data: providers.filter((p) => registeredKeys.includes(p.id)) } - } else { - return { data: providers } - } + /** + * Check if a provider is registered + */ + public isProviderRegistered(providerId: OcrProviderId): boolean { + return this.registry.has(providerId) } - public async getProvider(providerId: string) { - const providers = await dbService - .getDb() - .select() - .from(ocrProviderTable) - .where(eq(ocrProviderTable.id, providerId)) - .limit(1) - if (providers.length === 0) { - throw new Error(`OCR provider ${providerId} not found`) - } - return { data: providers[0] } - } + /** + * Get list of OCR providers + */ + public async listProviders(query?: ListOcrProvidersQuery): Promise { + try { + await this.ensureInitialized() + const result = await ocrProviderRepository.findAll(query) - public async patchProvider(update: PatchOcrProviderRequest): Promise { - const providers = await dbService - .getDb() - .select() - .from(ocrProviderTable) - .where(eq(ocrProviderTable.id, update.id)) - .limit(1) - if (providers.length == 0) { - throw new Error(`OCR provider ${update.id} not found`) - } - const found = providers[0] - const newProvider = { ...merge({}, found, update), updatedAt: dayjs().valueOf() } satisfies DbOcrProvider - if (!isDbOcrProvider(newProvider)) { - throw new Error('Invalid OCR provider data') - } - const [updated] = await dbService - .getDb() - .update(ocrProviderTable) - .set(newProvider) - .where(eq(ocrProviderTable.id, update.id)) - .returning() - return { data: updated } - } - - public async createProvider(create: CreateOcrProviderRequest): Promise { - const providers = await dbService - .getDb() - .select() - .from(ocrProviderTable) - .where(eq(ocrProviderTable.id, create.id)) - .limit(1) - - if (providers.length > 0) { - throw new Error(`OCR provider ${create.id} already exists`) - } - - const timestamp = dayjs().valueOf() - const newProvider = { - ...create, - createdAt: timestamp, - updatedAt: timestamp - } satisfies DbOcrProvider - - if (!isDbOcrProvider(newProvider)) { - throw new Error('Invalid OCR provider data') - } - const [created] = await dbService.getDb().insert(ocrProviderTable).values(newProvider).returning() - - return { data: created } - } - - public async putProvider(provider: PutOcrProviderRequest): Promise { - if (BuiltinOcrProviderIds.some((pid) => pid === provider.id)) { - throw new Error('Builtin OCR providers cannot be modified with PUT method.') - } - const providers = await dbService - .getDb() - .select() - .from(ocrProviderTable) - .where(eq(ocrProviderTable.id, provider.id)) - .limit(1) - - const timestamp = dayjs().valueOf() - if (providers.length === 0) { - const newProvider = { - ...provider, - createdAt: timestamp, - updatedAt: timestamp - } satisfies DbOcrProvider - if (!isDbOcrProvider(newProvider)) { - throw new Error('Invalid OCR provider data') + if (query?.registered) { + // Filter by registered providers + const registeredIds = this.getRegisteredProviderIds() + result.data = result.data.filter((provider) => registeredIds.includes(provider.id)) } - const [created] = await dbService.getDb().insert(ocrProviderTable).values(newProvider).returning() - return { data: created } - } - const existed = providers[0] - const newProvider = { - ...provider, - updatedAt: timestamp, - createdAt: existed.createdAt - } satisfies DbOcrProvider - if (!isDbOcrProvider(newProvider)) { - throw new Error('Invalid OCR provider data') + logger.debug(`Listed ${result.data.length} OCR providers`) + return result + } catch (error) { + logger.error('Failed to list OCR providers', error as Error) + throw error } - const [updated] = await dbService - .getDb() - .update(ocrProviderTable) - .set(newProvider) - .where(eq(ocrProviderTable.id, provider.id)) - .returning() - - return { data: updated } } - public async deleteProvider(providerId: string): Promise { - if (BuiltinOcrProviderIds.some((pid) => pid === providerId)) { - throw new Error('Builtin OCR providers cannot be deleted.') + /** + * Get OCR provider by ID + */ + public async getProvider(providerId: OcrProviderId): Promise<{ data: DbOcrProvider }> { + try { + await this.ensureInitialized() + const provider = await ocrProviderRepository.findById(providerId) + logger.debug(`Retrieved OCR provider: ${providerId}`) + return { data: provider } + } catch (error) { + logger.error(`Failed to get OCR provider ${providerId}`, error as Error) + throw error } - const providers = await dbService - .getDb() - .select() - .from(ocrProviderTable) - .where(eq(ocrProviderTable.id, providerId)) - .limit(1) - if (providers.length === 0) { - throw new Error(`OCR provider ${providerId} not found`) - } - await dbService.getDb().delete(ocrProviderTable).where(eq(ocrProviderTable.id, providerId)) } + /** + * Create new OCR provider + */ + public async createProvider(data: CreateOcrProviderRequest): Promise { + try { + await this.ensureInitialized() + const result = await ocrProviderRepository.create(data) + logger.info(`Created OCR provider: ${data.id}`) + return result + } catch (error) { + logger.error(`Failed to create OCR provider ${data.id}`, error as Error) + throw error + } + } + + /** + * Update OCR provider (partial update) + */ + public async updateProvider(id: OcrProviderId, data: Partial): Promise { + try { + await this.ensureInitialized() + const result = await ocrProviderRepository.update(id, data) + logger.info(`Updated OCR provider: ${id}`) + return result + } catch (error) { + logger.error(`Failed to update OCR provider ${id}`, error as Error) + throw error + } + } + + /** + * Replace OCR provider (full update) + */ + public async replaceProvider(data: PutOcrProviderRequest): Promise { + try { + await this.ensureInitialized() + const result = await ocrProviderRepository.replace(data) + logger.info(`Replaced OCR provider: ${data.id}`) + return result + } catch (error) { + logger.error(`Failed to replace OCR provider ${data.id}`, error as Error) + throw error + } + } + + /** + * Delete OCR provider + */ + public async deleteProvider(id: OcrProviderId): Promise { + try { + await this.ensureInitialized() + await ocrProviderRepository.delete(id) + logger.info(`Deleted OCR provider: ${id}`) + } catch (error) { + logger.error(`Failed to delete OCR provider ${id}`, error as Error) + throw error + } + } + + /** + * Perform OCR on a file using the specified provider + */ public async ocr(file: SupportedOcrFile, params: OcrParams): Promise { - const service = this.registry.get(params.providerId) - if (!service) { - throw new Error(`Provider ${params.providerId} is not registered`) + try { + await this.ensureInitialized() + const service = this.registry.get(params.providerId) + if (!service) { + throw new Error(`Provider ${params.providerId} is not registered`) + } + + // Validate that the provider exists in database + await this.getProvider(params.providerId) + + logger.debug(`Performing OCR with provider: ${params.providerId}`) + const result = await service.ocr(file) + + logger.info(`OCR completed successfully with provider: ${params.providerId}`) + return result + } catch (error) { + logger.error(`OCR failed with provider ${params.providerId}`, error as Error) + throw error } - return service.ocr(file) + } + + /** + * Check if a provider is available and ready + */ + public async isProviderAvailable(providerId: OcrProviderId): Promise { + try { + const service = this.registry.get(providerId) + if (!service) { + return false + } + + // Check if provider exists in database + await this.getProvider(providerId) + + // Additional availability checks can be added here + return true + } catch (error) { + logger.debug(`Provider ${providerId} is not available`, error as Error) + return false + } + } + + private async _isProviderAvailable(provider: OcrProvider): Promise { + try { + return this.registry.get(provider.id) !== undefined + } catch (error) { + logger.debug(`Provider ${provider.id} is not available`, error as Error) + return false + } + } + + /** + * Get available providers + * It's only for image type. May re-designed for a specific file type in the future. + * + */ + public async getAvailableProvidersForFile(): Promise { + try { + const providers = await this.listProviders() + + // Filter providers that can handle the file type + // This logic can be extended based on file type and provider capabilities + const availableProviders: DbOcrProvider[] = [] + const capFilter = (provider: OcrProvider) => provider.capabilities.image + + for (const provider of providers.data.filter(capFilter)) { + if (await this._isProviderAvailable(provider)) { + availableProviders.push(provider) + } + } + + logger.debug(`Found ${availableProviders.length} available providers for file`) + return availableProviders + } catch (error) { + logger.error('Failed to get available providers for file', error as Error) + throw error + } + } + + /** + * Cleanup resources + */ + public dispose(): void { + this.registry.clear() + logger.info('OCR service disposed') } }