Skip to content

Commit

Permalink
feat: add Language and parsingInstruction to LlamaParseReader (#779)
Browse files Browse the repository at this point in the history
Co-authored-by: Alex Yang <[email protected]>
  • Loading branch information
KindOfAScam and himself65 authored Apr 28, 2024
1 parent b03f765 commit 1ab3ba4
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 4 deletions.
Binary file added examples/data/manga.pdf
Binary file not shown.
3 changes: 2 additions & 1 deletion examples/readers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"start:markdown": "node --import tsx ./src/markdown.ts",
"start:pdf": "node --import tsx ./src/pdf.ts",
"start:llamaparse": "node --import tsx ./src/llamaparse.ts",
"start:notion": "node --import tsx ./src/notion.ts"
"start:notion": "node --import tsx ./src/notion.ts",
"start:llamaparse2": "node --import tsx ./src/llamaparse_2.ts"
},
"dependencies": {
"llamaindex": "*"
Expand Down
26 changes: 26 additions & 0 deletions examples/readers/src/llamaparse_2.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import fs from "fs/promises";
import { LlamaParseReader } from "llamaindex";

/**
 * Example entry point: parses a manga PDF with LlamaParse, passing a language
 * hint and a custom parsing instruction, then stores the combined page text
 * in a markdown file.
 */
async function main() {
  // Instruction text forwarded to the parser through `parsingInstruction`.
  const mangaInstruction =
    "The provided document is a manga comic book. Most pages do NOT have title. It does not contain tables. Try to reconstruct the dialogue happening in a cohesive way. Output any math equation in LATEX markdown (between $$)";

  // The API key may be passed to the constructor here, or supplied through
  // the LLAMA_CLOUD_API_KEY environment variable.
  const reader = new LlamaParseReader({
    resultType: "markdown",
    language: "en",
    parsingInstruction: mangaInstruction,
  });

  // NOTE: for copyright reasons the manga.pdf shipped in the data folder is
  // only a copy of the TOS; replace it with your own file. The original
  // author tested with "The Manga Guide to Calculus" by Hiroyuki Kojima.
  const sourcePdf = "../data/manga.pdf";
  const documents = await reader.loadData(sourcePdf);

  // Each returned document is treated as one page/section; pages are glued
  // together with a horizontal-rule separator.
  const pageTexts = documents.map((doc) => doc.text);
  const parsedManga = pageTexts.join("\n---\n");

  // The output path is relative to the current working directory
  // (examples/readers when started through the package script).
  const outputFile = "./parsedManga.md";
  try {
    await fs.writeFile(outputFile, parsedManga);
    console.log("Output successfully written to parsedManga.md");
  } catch (writeError) {
    // A failed write is reported but deliberately not rethrown.
    console.error("Error writing to file:", writeError);
  }
}

main().catch(console.error);
11 changes: 8 additions & 3 deletions packages/core/src/readers/LlamaParseReader.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import { defaultFS, getEnv, type GenericFileSystem } from "@llamaindex/env";
import { filetypemime } from "magic-bytes.js";
import { Document } from "../Node.js";
import type { FileReader } from "./type.js";

type ResultType = "text" | "markdown" | "json";
import type { FileReader, Language, ResultType } from "./type.js";

/**
* Represents a reader for parsing files using the LlamaParse API.
Expand All @@ -20,7 +18,12 @@ export class LlamaParseReader implements FileReader {
checkInterval = 1;
// Whether to print the progress of the parsing.
verbose = true;
// The result type for the parser.
resultType: ResultType = "text";
// The language of the text to parse.
language: Language = "en";
// The parsing instruction for the parser.
parsingInstruction: string = "";

constructor(params: Partial<LlamaParseReader> = {}) {
Object.assign(this, params);
Expand Down Expand Up @@ -48,6 +51,8 @@ export class LlamaParseReader implements FileReader {
const mimeType = await this.getMimeType(data);
const body = new FormData();
body.set("file", new Blob([data], { type: mimeType }), file);
body.append("language", this.language);
body.append("parsingInstruction", this.parsingInstruction);

const headers = {
Authorization: `Bearer ${this.apiKey}`,
Expand Down
88 changes: 88 additions & 0 deletions packages/core/src/readers/type.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,91 @@ export interface BaseReader {
/**
 * A reader that loads documents from a file identified by its path.
 */
export interface FileReader extends BaseReader {
/**
 * Loads the file at `filePath` and resolves with the resulting documents.
 *
 * @param filePath - Path of the file to read.
 * @param fs - Optional file system to read through; what is used when it is
 *   omitted is up to the implementing reader.
 * @returns A promise resolving to the documents produced from the file.
 */
loadData(filePath: string, fs?: CompleteFileSystem): Promise<Document[]>;
}

// For LlamaParseReader.ts

/** Result formats that can be selected through LlamaParseReader's `resultType` option. */
export type ResultType = "text" | "markdown" | "json";
/**
 * Language codes that can be assigned to LlamaParseReader's `language` option,
 * i.e. the language of the text to parse.
 *
 * NOTE(review): presumably mirrors the language codes accepted by the
 * LlamaParse service — verify against the upstream list before adding or
 * removing entries.
 */
export type Language =
| "abq"
| "ady"
| "af"
| "ang"
| "ar"
| "as"
| "ava"
| "az"
| "be"
| "bg"
| "bh"
| "bho"
| "bn"
| "bs"
| "ch_sim"
| "ch_tra"
| "che"
| "cs"
| "cy"
| "da"
| "dar"
| "de"
| "en"
| "es"
| "et"
| "fa"
| "fr"
| "ga"
| "gom"
| "hi"
| "hr"
| "hu"
| "id"
| "inh"
| "is"
| "it"
| "ja"
| "kbd"
| "kn"
| "ko"
| "ku"
| "la"
| "lbe"
| "lez"
| "lt"
| "lv"
| "mah"
| "mai"
| "mi"
| "mn"
| "mr"
| "ms"
| "mt"
| "ne"
| "new"
| "nl"
| "no"
| "oc"
| "pi"
| "pl"
| "pt"
| "ro"
| "ru"
| "rs_cyrillic"
| "rs_latin"
| "sck"
| "sk"
| "sl"
| "sq"
| "sv"
| "sw"
| "ta"
| "tab"
| "te"
| "th"
| "tjk"
| "tl"
| "tr"
| "ug"
| "uk"
| "ur"
| "uz"
| "vi";

0 comments on commit 1ab3ba4

Please sign in to comment.