diff --git a/packages/core/src/readers/MarkdownReader.ts b/packages/core/src/readers/MarkdownReader.ts index 90b2c7843a..a747497b8a 100644 --- a/packages/core/src/readers/MarkdownReader.ts +++ b/packages/core/src/readers/MarkdownReader.ts @@ -95,16 +95,16 @@ export class MarkdownReader implements FileReader { const content = await fs.readFile(file); const tups = this.parseTups(content); const results: Document[] = []; + let counter = 0; for (const [header, value] of tups) { + const id_ = `${file}_${counter}`; if (header) { - results.push( - new Document({ - text: `\n\n${header}\n${value}`, - }), - ); + const text = `\n\n${header}\n${value}`; + results.push(new Document({ text, id_ })); } else { - results.push(new Document({ text: value })); + results.push(new Document({ text: value, id_ })); } + counter += 1; } return results; } diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts index 46b3c08f33..ee1b1b2f70 100644 --- a/packages/core/src/readers/PDFReader.ts +++ b/packages/core/src/readers/PDFReader.ts @@ -1,5 +1,5 @@ import type { GenericFileSystem } from "@llamaindex/env"; -import { createSHA256, defaultFS } from "@llamaindex/env"; +import { defaultFS } from "@llamaindex/env"; import { Document } from "../Node.js"; import type { BaseReader } from "./type.js"; @@ -13,13 +13,9 @@ export class PDFReader implements BaseReader { ): Promise { const content = await fs.readRawFile(file); const text = await readPDF(content); - return text.map((text) => { - const sha256 = createSHA256(); - sha256.update(text); - return new Document({ - text, - id_: sha256.digest(), - }); + return text.map((text, page) => { + const id_ = `${file}_${page}`; + return new Document({ text, id_ }); }); } }