cherry-studio/src/main/knowledage/loader/noteLoader.ts
亢奋猫 9f29194180
refactor: Restructure the knowledge base directory (#7754)
重构知识库目录结构,代码逻辑完全不变

├── embeddings
│   ├── Embeddings.ts
│   ├── EmbeddingsFactory.ts
│   └── VoyageEmbeddings.ts
├── loader
│   ├── draftsExportLoader.ts
│   ├── epubLoader.ts
│   ├── index.ts
│   ├── noteLoader.ts
│   └── odLoader.ts
└── reranker
    ├── BaseReranker.ts
    ├── GeneralReranker.ts
    └── Reranker.ts

4 directories, 11 files
2025-07-02 15:23:02 +08:00

45 lines
1.1 KiB
TypeScript

import { BaseLoader } from '@cherrystudio/embedjs-interfaces'
import { cleanString } from '@cherrystudio/embedjs-utils'
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'
import md5 from 'md5'
/** Constructor options accepted by {@link NoteLoader}. */
interface NoteLoaderOptions {
  /** Raw note text to be split into chunks. */
  text: string
  /** Optional origin of the note; reported as the chunk `source` when non-empty. */
  sourceUrl?: string
  /** Chunk size forwarded to the base loader and the text splitter (defaults to 2000). */
  chunkSize?: number
  /** Chunk overlap forwarded to the base loader and the text splitter (defaults to 0). */
  chunkOverlap?: number
}

/**
 * Loader that turns an in-memory note into chunks.
 *
 * The loader id passed to the base class is `NoteLoader_` followed by the md5
 * of the note text concatenated with the source URL (or an empty string when
 * no URL is given).
 */
export class NoteLoader extends BaseLoader<{ type: 'NoteLoader' }> {
  private readonly text: string
  private readonly sourceUrl?: string

  /**
   * @param options - Note text plus optional source URL and chunking settings.
   *   `chunkSize` falls back to 2000 and `chunkOverlap` to 0 when omitted
   *   (`null`/`undefined`).
   */
  constructor({ text, sourceUrl, chunkSize, chunkOverlap }: NoteLoaderOptions) {
    super(
      `NoteLoader_${md5(text + (sourceUrl || ''))}`,
      { text, sourceUrl },
      chunkSize ?? 2000,
      chunkOverlap ?? 0
    )

    this.text = text
    this.sourceUrl = sourceUrl
  }

  /**
   * Splits the note text (after passing it through `cleanString`) with a
   * `RecursiveCharacterTextSplitter` configured from this loader's
   * `chunkSize` / `chunkOverlap`, and yields one chunk object per piece.
   *
   * Each yielded chunk carries `metadata.type === 'NoteLoader'` and a
   * `metadata.source` equal to the source URL, or `'note'` when the URL is
   * missing or empty.
   */
  override async *getUnfilteredChunks() {
    const { chunkSize, chunkOverlap } = this
    const splitter = new RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap })

    // `sourceUrl` is readonly, so the fallback can be resolved once up front.
    const source = this.sourceUrl || 'note'

    const pieces = await splitter.splitText(cleanString(this.text))
    for (const pageContent of pieces) {
      yield {
        pageContent,
        metadata: {
          type: 'NoteLoader' as const,
          source
        }
      }
    }
  }
}