diff --git a/.yarn/patches/tesseract.js-npm-6.0.1-2562a7e46d.patch b/.yarn/patches/tesseract.js-npm-6.0.1-2562a7e46d.patch
new file mode 100644
index 0000000000..0cb156ee99
--- /dev/null
+++ b/.yarn/patches/tesseract.js-npm-6.0.1-2562a7e46d.patch
@@ -0,0 +1,348 @@
+diff --git a/src/constants/languages.d.ts b/src/constants/languages.d.ts
+new file mode 100644
+index 0000000000000000000000000000000000000000..6a2ba5086187622b8ca8887bcc7406018fba8a89
+--- /dev/null
++++ b/src/constants/languages.d.ts
+@@ -0,0 +1,43 @@
++/**
++ * Languages with existing tesseract traineddata
++ * https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
++ */
++
++// Define the language codes as string literals
++type LanguageCode =
++ | 'afr' | 'amh' | 'ara' | 'asm' | 'aze' | 'aze_cyrl' | 'bel' | 'ben' | 'bod' | 'bos'
++ | 'bul' | 'cat' | 'ceb' | 'ces' | 'chi_sim' | 'chi_tra' | 'chr' | 'cym' | 'dan' | 'deu'
++ | 'dzo' | 'ell' | 'eng' | 'enm' | 'epo' | 'est' | 'eus' | 'fas' | 'fin' | 'fra'
++ | 'frk' | 'frm' | 'gle' | 'glg' | 'grc' | 'guj' | 'hat' | 'heb' | 'hin' | 'hrv'
++ | 'hun' | 'iku' | 'ind' | 'isl' | 'ita' | 'ita_old' | 'jav' | 'jpn' | 'kan' | 'kat'
++ | 'kat_old' | 'kaz' | 'khm' | 'kir' | 'kor' | 'kur' | 'lao' | 'lat' | 'lav' | 'lit'
++ | 'mal' | 'mar' | 'mkd' | 'mlt' | 'msa' | 'mya' | 'nep' | 'nld' | 'nor' | 'ori'
++ | 'pan' | 'pol' | 'por' | 'pus' | 'ron' | 'rus' | 'san' | 'sin' | 'slk' | 'slv'
++ | 'spa' | 'spa_old' | 'sqi' | 'srp' | 'srp_latn' | 'swa' | 'swe' | 'syr' | 'tam' | 'tel'
++ | 'tgk' | 'tgl' | 'tha' | 'tir' | 'tur' | 'uig' | 'ukr' | 'urd' | 'uzb' | 'uzb_cyrl'
++ | 'vie' | 'yid';
++
++// Define the language keys as string literals
++type LanguageKey =
++ | 'AFR' | 'AMH' | 'ARA' | 'ASM' | 'AZE' | 'AZE_CYRL' | 'BEL' | 'BEN' | 'BOD' | 'BOS'
++ | 'BUL' | 'CAT' | 'CEB' | 'CES' | 'CHI_SIM' | 'CHI_TRA' | 'CHR' | 'CYM' | 'DAN' | 'DEU'
++ | 'DZO' | 'ELL' | 'ENG' | 'ENM' | 'EPO' | 'EST' | 'EUS' | 'FAS' | 'FIN' | 'FRA'
++ | 'FRK' | 'FRM' | 'GLE' | 'GLG' | 'GRC' | 'GUJ' | 'HAT' | 'HEB' | 'HIN' | 'HRV'
++ | 'HUN' | 'IKU' | 'IND' | 'ISL' | 'ITA' | 'ITA_OLD' | 'JAV' | 'JPN' | 'KAN' | 'KAT'
++ | 'KAT_OLD' | 'KAZ' | 'KHM' | 'KIR' | 'KOR' | 'KUR' | 'LAO' | 'LAT' | 'LAV' | 'LIT'
++ | 'MAL' | 'MAR' | 'MKD' | 'MLT' | 'MSA' | 'MYA' | 'NEP' | 'NLD' | 'NOR' | 'ORI'
++ | 'PAN' | 'POL' | 'POR' | 'PUS' | 'RON' | 'RUS' | 'SAN' | 'SIN' | 'SLK' | 'SLV'
++ | 'SPA' | 'SPA_OLD' | 'SQI' | 'SRP' | 'SRP_LATN' | 'SWA' | 'SWE' | 'SYR' | 'TAM' | 'TEL'
++ | 'TGK' | 'TGL' | 'THA' | 'TIR' | 'TUR' | 'UIG' | 'UKR' | 'URD' | 'UZB' | 'UZB_CYRL'
++ | 'VIE' | 'YID';
++
++// Mapped type tying each upper-case key to its own lower-case code (e.g. ENG -> 'eng', AZE_CYRL -> 'aze_cyrl')
++type LanguagesMap = {
++ [K in LanguageKey]: Lowercase<K>;
++};
++
++// Declare the exported constant with the specific type
++export const LANGUAGES: LanguagesMap;
++
++// Export the individual types for use in other modules
++export type { LanguageCode, LanguageKey, LanguagesMap };
+\ No newline at end of file
+diff --git a/src/index.d.ts b/src/index.d.ts
+index 1f5a9c8094fe4de7983467f9efb43bdb4de535f2..16dc95cf68663673e37e189b719cb74897b7735f 100644
+--- a/src/index.d.ts
++++ b/src/index.d.ts
+@@ -1,31 +1,74 @@
++/// <reference types="node" />
++
++// Import the languages types (type-only import; triple-slash directives must precede any statement)
++import type { LanguagesMap } from "./constants/languages";
++
+ declare namespace Tesseract {
+- function createScheduler(): Scheduler
+- function createWorker(langs?: string | string[] | Lang[], oem?: OEM, options?: Partial<WorkerOptions>, config?: string | Partial<InitOptions>): Promise<Worker>
+- function setLogging(logging: boolean): void
+- function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult>
+- function detect(image: ImageLike, options?: Partial<WorkerOptions>): any
++ function createScheduler(): Scheduler;
++ function createWorker(
++ langs?: LanguageCode | LanguageCode[] | Lang[],
++ oem?: OEM,
++ options?: Partial<WorkerOptions>,
++ config?: string | Partial<InitOptions>
++ ): Promise<Worker>;
++ function setLogging(logging: boolean): void;
++ function recognize(
++ image: ImageLike,
++ langs?: LanguageCode,
++ options?: Partial<WorkerOptions>
++ ): Promise<RecognizeResult>;
++ function detect(image: ImageLike, options?: Partial<WorkerOptions>): any;
++
++ // Export languages constant
++ const languages: LanguagesMap;
++
++ type LanguageCode = import("./constants/languages").LanguageCode;
++ type LanguageKey = import("./constants/languages").LanguageKey;
+
+ interface Scheduler {
+- addWorker(worker: Worker): string
+- addJob(action: 'recognize', ...args: Parameters<Worker['recognize']>): Promise<RecognizeResult>
+- addJob(action: 'detect', ...args: Parameters<Worker['detect']>): Promise<DetectResult>
+- terminate(): Promise<any>
+- getQueueLen(): number
+- getNumWorkers(): number
++ addWorker(worker: Worker): string;
++ addJob(
++ action: "recognize",
++ ...args: Parameters<Worker["recognize"]>
++ ): Promise<RecognizeResult>;
++ addJob(
++ action: "detect",
++ ...args: Parameters<Worker["detect"]>
++ ): Promise<DetectResult>;
++ terminate(): Promise<any>;
++ getQueueLen(): number;
++ getNumWorkers(): number;
+ }
+
+ interface Worker {
+- load(jobId?: string): Promise<ConfigResult>
+- writeText(path: string, text: string, jobId?: string): Promise<ConfigResult>
+- readText(path: string, jobId?: string): Promise<ConfigResult>
+- removeText(path: string, jobId?: string): Promise<ConfigResult>
+- FS(method: string, args: any[], jobId?: string): Promise<ConfigResult>
+- reinitialize(langs?: string | Lang[], oem?: OEM, config?: string | Partial<InitOptions>, jobId?: string): Promise<ConfigResult>
+- setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult>
+- getImage(type: imageType): string
+- recognize(image: ImageLike, options?: Partial<RecognizeOptions>, output?: Partial<OutputFormats>, jobId?: string): Promise<RecognizeResult>
+- detect(image: ImageLike, jobId?: string): Promise<DetectResult>
+- terminate(jobId?: string): Promise<ConfigResult>
++ load(jobId?: string): Promise<ConfigResult>;
++ writeText(
++ path: string,
++ text: string,
++ jobId?: string
++ ): Promise<ConfigResult>;
++ readText(path: string, jobId?: string): Promise<ConfigResult>;
++ removeText(path: string, jobId?: string): Promise<ConfigResult>;
++ FS(method: string, args: any[], jobId?: string): Promise<ConfigResult>;
++ reinitialize(
++ langs?: string | Lang[],
++ oem?: OEM,
++ config?: string | Partial<InitOptions>,
++ jobId?: string
++ ): Promise<ConfigResult>;
++ setParameters(
++ params: Partial<WorkerParams>,
++ jobId?: string
++ ): Promise<ConfigResult>;
++ getImage(type: imageType): string;
++ recognize(
++ image: ImageLike,
++ options?: Partial<RecognizeOptions>,
++ output?: Partial<OutputFormats>,
++ jobId?: string
++ ): Promise<RecognizeResult>;
++ detect(image: ImageLike, jobId?: string): Promise<DetectResult>;
++ terminate(jobId?: string): Promise<ConfigResult>;
+ }
+
+ interface Lang {
+@@ -34,43 +77,43 @@ declare namespace Tesseract {
+ }
+
+ interface InitOptions {
+- load_system_dawg: string
+- load_freq_dawg: string
+- load_unambig_dawg: string
+- load_punc_dawg: string
+- load_number_dawg: string
+- load_bigram_dawg: string
+- }
+-
+- type LoggerMessage = {
+- jobId: string
+- progress: number
+- status: string
+- userJobId: string
+- workerId: string
++ load_system_dawg: string;
++ load_freq_dawg: string;
++ load_unambig_dawg: string;
++ load_punc_dawg: string;
++ load_number_dawg: string;
++ load_bigram_dawg: string;
+ }
+-
++
++ type LoggerMessage = {
++ jobId: string;
++ progress: number;
++ status: string;
++ userJobId: string;
++ workerId: string;
++ };
++
+ interface WorkerOptions {
+- corePath: string
+- langPath: string
+- cachePath: string
+- dataPath: string
+- workerPath: string
+- cacheMethod: string
+- workerBlobURL: boolean
+- gzip: boolean
+- legacyLang: boolean
+- legacyCore: boolean
+- logger: (arg: LoggerMessage) => void,
+- errorHandler: (arg: any) => void
++ corePath: string;
++ langPath: string;
++ cachePath: string;
++ dataPath: string;
++ workerPath: string;
++ cacheMethod: string;
++ workerBlobURL: boolean;
++ gzip: boolean;
++ legacyLang: boolean;
++ legacyCore: boolean;
++ logger: (arg: LoggerMessage) => void;
++ errorHandler: (arg: any) => void;
+ }
+ interface WorkerParams {
+- tessedit_pageseg_mode: PSM
+- tessedit_char_whitelist: string
+- tessedit_char_blacklist: string
+- preserve_interword_spaces: string
+- user_defined_dpi: string
+- [propName: string]: any
++ tessedit_pageseg_mode: PSM;
++ tessedit_char_whitelist: string;
++ tessedit_char_blacklist: string;
++ preserve_interword_spaces: string;
++ user_defined_dpi: string;
++ [propName: string]: any;
+ }
+ interface OutputFormats {
+ text: boolean;
+@@ -88,36 +131,36 @@ declare namespace Tesseract {
+ debug: boolean;
+ }
+ interface RecognizeOptions {
+- rectangle: Rectangle
+- pdfTitle: string
+- pdfTextOnly: boolean
+- rotateAuto: boolean
+- rotateRadians: number
++ rectangle: Rectangle;
++ pdfTitle: string;
++ pdfTextOnly: boolean;
++ rotateAuto: boolean;
++ rotateRadians: number;
+ }
+ interface ConfigResult {
+- jobId: string
+- data: any
++ jobId: string;
++ data: any;
+ }
+ interface RecognizeResult {
+- jobId: string
+- data: Page
++ jobId: string;
++ data: Page;
+ }
+ interface DetectResult {
+- jobId: string
+- data: DetectData
++ jobId: string;
++ data: DetectData;
+ }
+ interface DetectData {
+- tesseract_script_id: number | null
+- script: string | null
+- script_confidence: number | null
+- orientation_degrees: number | null
+- orientation_confidence: number | null
++ tesseract_script_id: number | null;
++ script: string | null;
++ script_confidence: number | null;
++ orientation_degrees: number | null;
++ orientation_confidence: number | null;
+ }
+ interface Rectangle {
+- left: number
+- top: number
+- width: number
+- height: number
++ left: number;
++ top: number;
++ width: number;
++ height: number;
+ }
+ enum OEM {
+ TESSERACT_ONLY,
+@@ -126,28 +169,36 @@ declare namespace Tesseract {
+ DEFAULT,
+ }
+ enum PSM {
+- OSD_ONLY = '0',
+- AUTO_OSD = '1',
+- AUTO_ONLY = '2',
+- AUTO = '3',
+- SINGLE_COLUMN = '4',
+- SINGLE_BLOCK_VERT_TEXT = '5',
+- SINGLE_BLOCK = '6',
+- SINGLE_LINE = '7',
+- SINGLE_WORD = '8',
+- CIRCLE_WORD = '9',
+- SINGLE_CHAR = '10',
+- SPARSE_TEXT = '11',
+- SPARSE_TEXT_OSD = '12',
+- RAW_LINE = '13'
++ OSD_ONLY = "0",
++ AUTO_OSD = "1",
++ AUTO_ONLY = "2",
++ AUTO = "3",
++ SINGLE_COLUMN = "4",
++ SINGLE_BLOCK_VERT_TEXT = "5",
++ SINGLE_BLOCK = "6",
++ SINGLE_LINE = "7",
++ SINGLE_WORD = "8",
++ CIRCLE_WORD = "9",
++ SINGLE_CHAR = "10",
++ SPARSE_TEXT = "11",
++ SPARSE_TEXT_OSD = "12",
++ RAW_LINE = "13",
+ }
+ const enum imageType {
+ COLOR = 0,
+ GREY = 1,
+- BINARY = 2
++ BINARY = 2,
+ }
+- type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement
+- | CanvasRenderingContext2D | File | Blob | Buffer | OffscreenCanvas;
++ type ImageLike =
++ | string
++ | HTMLImageElement
++ | HTMLCanvasElement
++ | HTMLVideoElement
++ | CanvasRenderingContext2D
++ | File
++ | Blob
++ | (typeof Buffer extends undefined ? never : Buffer)
++ | OffscreenCanvas;
+ interface Block {
+ paragraphs: Paragraph[];
+ text: string;
+@@ -179,7 +230,7 @@ declare namespace Tesseract {
+ text: string;
+ confidence: number;
+ baseline: Baseline;
+- rowAttributes: RowAttributes
++ rowAttributes: RowAttributes;
+ bbox: Bbox;
+ }
+ interface Paragraph {
diff --git a/package.json b/package.json
index 49f7340559..c734271de5 100644
--- a/package.json
+++ b/package.json
@@ -79,7 +79,7 @@
"officeparser": "^4.2.0",
"os-proxy-config": "^1.1.2",
"selection-hook": "^1.0.11",
- "tesseract.js": "^6.0.1",
+ "tesseract.js": "patch:tesseract.js@npm%3A6.0.1#~/.yarn/patches/tesseract.js-npm-6.0.1-2562a7e46d.patch",
"turndown": "7.2.0"
},
"devDependencies": {
@@ -293,7 +293,8 @@
"pdf-parse@npm:1.1.1": "patch:pdf-parse@npm%3A1.1.1#~/.yarn/patches/pdf-parse-npm-1.1.1-04a6109b2a.patch",
"pkce-challenge@npm:^4.1.0": "patch:pkce-challenge@npm%3A4.1.0#~/.yarn/patches/pkce-challenge-npm-4.1.0-fbc51695a3.patch",
"undici": "6.21.2",
- "vite": "npm:rolldown-vite@latest"
+ "vite": "npm:rolldown-vite@latest",
+ "tesseract.js@npm:*": "patch:tesseract.js@npm%3A6.0.1#~/.yarn/patches/tesseract.js-npm-6.0.1-2562a7e46d.patch"
},
"packageManager": "yarn@4.9.1",
"lint-staged": {
diff --git a/yarn.lock b/yarn.lock
index 7b9986f6c0..f57df818e7 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -8579,7 +8579,7 @@ __metadata:
string-width: "npm:^7.2.0"
styled-components: "npm:^6.1.11"
tar: "npm:^7.4.3"
- tesseract.js: "npm:^6.0.1"
+ tesseract.js: "patch:tesseract.js@npm%3A6.0.1#~/.yarn/patches/tesseract.js-npm-6.0.1-2562a7e46d.patch"
tiny-pinyin: "npm:^1.3.2"
tokenx: "npm:^1.1.0"
tsx: "npm:^4.20.3"
@@ -20978,7 +20978,7 @@ __metadata:
languageName: node
linkType: hard
-"tesseract.js@npm:*, tesseract.js@npm:^6.0.1":
+"tesseract.js@npm:6.0.1":
version: 6.0.1
resolution: "tesseract.js@npm:6.0.1"
dependencies:
@@ -20995,6 +20995,23 @@ __metadata:
languageName: node
linkType: hard
+"tesseract.js@patch:tesseract.js@npm%3A6.0.1#~/.yarn/patches/tesseract.js-npm-6.0.1-2562a7e46d.patch":
+ version: 6.0.1
+ resolution: "tesseract.js@patch:tesseract.js@npm%3A6.0.1#~/.yarn/patches/tesseract.js-npm-6.0.1-2562a7e46d.patch::version=6.0.1&hash=a9cf7b"
+ dependencies:
+ bmp-js: "npm:^0.1.0"
+ idb-keyval: "npm:^6.2.0"
+ is-url: "npm:^1.2.4"
+ node-fetch: "npm:^2.6.9"
+ opencollective-postinstall: "npm:^2.0.3"
+ regenerator-runtime: "npm:^0.13.3"
+ tesseract.js-core: "npm:^6.0.0"
+ wasm-feature-detect: "npm:^1.2.11"
+ zlibjs: "npm:^0.3.1"
+ checksum: 10c0/8a94fcc688ff21a9e82b721563d8fa174837ba807d0f01290fe9a1bb6a1c96ecaf7dc1c83510510f3d5185abd15f1cc5fc3cb7ad6c0eee0c4b3e278106f8a5da
+ languageName: node
+ linkType: hard
+
"test-exclude@npm:^7.0.1":
version: 7.0.1
resolution: "test-exclude@npm:7.0.1"