mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2025-12-22 08:40:08 +08:00
249 lines
7.0 KiB
TypeScript
249 lines
7.0 KiB
TypeScript
import { BaseLoader } from '@cherrystudio/embedjs-interfaces'
|
||
import { cleanString } from '@cherrystudio/embedjs-utils'
|
||
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'
|
||
import { getTempDir } from '@main/utils/file'
|
||
import Logger from 'electron-log'
|
||
import EPub from 'epub'
|
||
import * as fs from 'fs'
|
||
import path from 'path'
|
||
|
||
/**
|
||
* epub 加载器的配置选项
|
||
*/
|
||
interface EpubLoaderOptions {
|
||
/** epub 文件路径 */
|
||
filePath: string
|
||
/** 文本分块大小 */
|
||
chunkSize: number
|
||
/** 分块重叠大小 */
|
||
chunkOverlap: number
|
||
}
|
||
|
||
/**
|
||
* epub 文件的元数据信息
|
||
*/
|
||
interface EpubMetadata {
|
||
/** 作者显示名称(例如:"Lewis Carroll") */
|
||
creator?: string
|
||
/** 作者规范化名称,用于排序和索引(例如:"Carroll, Lewis") */
|
||
creatorFileAs?: string
|
||
/** 书籍标题(例如:"Alice's Adventures in Wonderland") */
|
||
title?: string
|
||
/** 语言代码(例如:"en" 或 "zh-CN") */
|
||
language?: string
|
||
/** 主题或分类(例如:"Fantasy"、"Fiction") */
|
||
subject?: string
|
||
/** 创建日期(例如:"2024-02-14") */
|
||
date?: string
|
||
/** 书籍描述或简介 */
|
||
description?: string
|
||
}
|
||
|
||
/**
|
||
* epub 章节信息
|
||
*/
|
||
interface EpubChapter {
|
||
/** 章节 ID */
|
||
id: string
|
||
/** 章节标题 */
|
||
title?: string
|
||
/** 章节顺序 */
|
||
order?: number
|
||
}
|
||
|
||
/**
|
||
* epub 文件加载器
|
||
* 用于解析 epub 电子书文件,提取文本内容和元数据
|
||
*/
|
||
export class EpubLoader extends BaseLoader<Record<string, string | number | boolean>, Record<string, unknown>> {
|
||
protected filePath: string
|
||
protected chunkSize: number
|
||
protected chunkOverlap: number
|
||
private extractedText: string
|
||
private metadata: EpubMetadata | null
|
||
|
||
/**
|
||
* 创建 epub 加载器实例
|
||
* @param options 加载器配置选项
|
||
*/
|
||
constructor(options: EpubLoaderOptions) {
|
||
super(options.filePath, {
|
||
chunkSize: options.chunkSize,
|
||
chunkOverlap: options.chunkOverlap
|
||
})
|
||
this.filePath = options.filePath
|
||
this.chunkSize = options.chunkSize
|
||
this.chunkOverlap = options.chunkOverlap
|
||
this.extractedText = ''
|
||
this.metadata = null
|
||
}
|
||
|
||
/**
|
||
* 等待 epub 文件初始化完成
|
||
* epub 库使用事件机制,需要等待 'end' 事件触发后才能访问文件内容
|
||
* @param epub epub 实例
|
||
* @returns 元数据和章节信息
|
||
*/
|
||
private waitForEpubInit(epub: any): Promise<{ metadata: EpubMetadata; chapters: EpubChapter[] }> {
|
||
return new Promise((resolve, reject) => {
|
||
epub.on('end', () => {
|
||
// 提取元数据
|
||
const metadata: EpubMetadata = {
|
||
creator: epub.metadata.creator,
|
||
creatorFileAs: epub.metadata.creatorFileAs,
|
||
title: epub.metadata.title,
|
||
language: epub.metadata.language,
|
||
subject: epub.metadata.subject,
|
||
date: epub.metadata.date,
|
||
description: epub.metadata.description
|
||
}
|
||
|
||
// 提取章节信息
|
||
const chapters: EpubChapter[] = epub.flow.map((chapter: any, index: number) => ({
|
||
id: chapter.id,
|
||
title: chapter.title || `Chapter ${index + 1}`,
|
||
order: index + 1
|
||
}))
|
||
|
||
resolve({ metadata, chapters })
|
||
})
|
||
|
||
epub.on('error', (error: Error) => {
|
||
reject(error)
|
||
})
|
||
|
||
epub.parse()
|
||
})
|
||
}
|
||
|
||
/**
|
||
* 获取章节内容
|
||
* @param epub epub 实例
|
||
* @param chapterId 章节 ID
|
||
* @returns 章节文本内容
|
||
*/
|
||
private getChapter(epub: any, chapterId: string): Promise<string> {
|
||
return new Promise((resolve, reject) => {
|
||
epub.getChapter(chapterId, (error: Error | null, text: string) => {
|
||
if (error) {
|
||
reject(error)
|
||
} else {
|
||
resolve(text)
|
||
}
|
||
})
|
||
})
|
||
}
|
||
|
||
/**
|
||
* 从 epub 文件中提取文本内容
|
||
* 1. 检查文件是否存在
|
||
* 2. 初始化 epub 并获取元数据
|
||
* 3. 遍历所有章节并提取文本
|
||
* 4. 清理 HTML 标签
|
||
* 5. 合并所有章节文本
|
||
*/
|
||
private async extractTextFromEpub() {
|
||
try {
|
||
// 检查文件是否存在
|
||
if (!fs.existsSync(this.filePath)) {
|
||
throw new Error(`File not found: ${this.filePath}`)
|
||
}
|
||
|
||
const epub = new EPub(this.filePath)
|
||
|
||
// 等待 epub 初始化完成并获取元数据
|
||
const { metadata, chapters } = await this.waitForEpubInit(epub)
|
||
this.metadata = metadata
|
||
|
||
if (!epub.flow || epub.flow.length === 0) {
|
||
throw new Error('No content found in epub file')
|
||
}
|
||
|
||
// 使用临时文件而不是内存数组
|
||
const tempFilePath = path.join(getTempDir(), `epub-${Date.now()}.txt`)
|
||
const writeStream = fs.createWriteStream(tempFilePath)
|
||
|
||
// 遍历所有章节
|
||
for (const chapter of chapters) {
|
||
try {
|
||
const content = await this.getChapter(epub, chapter.id)
|
||
|
||
if (!content) {
|
||
continue
|
||
}
|
||
|
||
// 移除 HTML 标签并清理文本
|
||
const text = content
|
||
.replace(/<[^>]*>/g, ' ') // 移除所有 HTML 标签
|
||
.replace(/\s+/g, ' ') // 将多个空白字符替换为单个空格
|
||
.trim() // 移除首尾空白
|
||
|
||
if (text) {
|
||
// 直接写入文件
|
||
writeStream.write(text + '\n\n')
|
||
}
|
||
} catch (error) {
|
||
Logger.error(`[EpubLoader] Error processing chapter ${chapter.id}:`, error)
|
||
}
|
||
}
|
||
|
||
// 关闭写入流
|
||
writeStream.end()
|
||
|
||
// 等待写入完成
|
||
await new Promise<void>((resolve, reject) => {
|
||
writeStream.on('finish', resolve)
|
||
writeStream.on('error', reject)
|
||
})
|
||
|
||
// 从临时文件读取内容
|
||
this.extractedText = fs.readFileSync(tempFilePath, 'utf-8')
|
||
|
||
// 删除临时文件
|
||
fs.unlinkSync(tempFilePath)
|
||
|
||
// 只添加一条完成日志
|
||
Logger.info(`[EpubLoader] 电子书 ${this.metadata?.title || path.basename(this.filePath)} 处理完成`)
|
||
} catch (error) {
|
||
Logger.error('[EpubLoader] Error in extractTextFromEpub:', error)
|
||
throw error
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 生成文本块
|
||
* 重写 BaseLoader 的方法,将提取的文本分割成适当大小的块
|
||
* 每个块都包含源文件和元数据信息
|
||
*/
|
||
override async *getUnfilteredChunks() {
|
||
// 如果还没有提取文本,先提取
|
||
if (!this.extractedText) {
|
||
await this.extractTextFromEpub()
|
||
}
|
||
|
||
Logger.info('[EpubLoader] 书名:', this.metadata?.title || '未知书名', ' 文本大小:', this.extractedText.length)
|
||
|
||
// 创建文本分块器
|
||
const chunker = new RecursiveCharacterTextSplitter({
|
||
chunkSize: this.chunkSize,
|
||
chunkOverlap: this.chunkOverlap
|
||
})
|
||
|
||
// 清理并分割文本
|
||
const chunks = await chunker.splitText(cleanString(this.extractedText))
|
||
|
||
// 为每个文本块添加元数据
|
||
for (const chunk of chunks) {
|
||
yield {
|
||
pageContent: chunk,
|
||
metadata: {
|
||
source: this.filePath,
|
||
title: this.metadata?.title || '',
|
||
creator: this.metadata?.creator || '',
|
||
language: this.metadata?.language || ''
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|