From 645fcf6c24e70ce110a803a4006a3b54ae2f74d2 Mon Sep 17 00:00:00 2001
From: ezirmusitua
Date: Tue, 7 May 2024 11:07:39 +0800
Subject: [PATCH] fix: use sha256 hash value as the `Document.id_` in `MarkdownReader` (#768)

Co-authored-by: Alex Yang
---
 packages/core/src/readers/MarkdownReader.ts | 12 ++++++------
 packages/core/src/readers/PDFReader.ts      | 12 ++++--------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/packages/core/src/readers/MarkdownReader.ts b/packages/core/src/readers/MarkdownReader.ts
index 90b2c7843a..a747497b8a 100644
--- a/packages/core/src/readers/MarkdownReader.ts
+++ b/packages/core/src/readers/MarkdownReader.ts
@@ -95,16 +95,16 @@ export class MarkdownReader implements FileReader {
     const content = await fs.readFile(file);
     const tups = this.parseTups(content);
     const results: Document[] = [];
+    let counter = 0;
     for (const [header, value] of tups) {
+      const id_ = `${file}_${counter}`;
       if (header) {
-        results.push(
-          new Document({
-            text: `\n\n${header}\n${value}`,
-          }),
-        );
+        const text = `\n\n${header}\n${value}`;
+        results.push(new Document({ text, id_ }));
       } else {
-        results.push(new Document({ text: value }));
+        results.push(new Document({ text: value, id_ }));
       }
+      counter += 1;
     }
     return results;
   }
diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts
index 46b3c08f33..ee1b1b2f70 100644
--- a/packages/core/src/readers/PDFReader.ts
+++ b/packages/core/src/readers/PDFReader.ts
@@ -1,5 +1,5 @@
 import type { GenericFileSystem } from "@llamaindex/env";
-import { createSHA256, defaultFS } from "@llamaindex/env";
+import { defaultFS } from "@llamaindex/env";
 import { Document } from "../Node.js";
 import type { BaseReader } from "./type.js";
 
@@ -13,13 +13,9 @@ export class PDFReader implements BaseReader {
   ): Promise<Document[]> {
     const content = await fs.readRawFile(file);
     const text = await readPDF(content);
-    return text.map((text) => {
-      const sha256 = createSHA256();
-      sha256.update(text);
-      return new Document({
-        text,
-        id_: sha256.digest(),
-      });
+    return text.map((text, page) => {
+      const id_ = `${file}_${page}`;
+      return new Document({ text, id_ });
     });
   }
 }