Skip to content

Commit

Permalink
fix: LlamaParse json mode returns array + basic example (run-llama#914)
Browse files Browse the repository at this point in the history
Co-authored-by: Marcus Schiesser <[email protected]>
Co-authored-by: Marcus Schiesser <[email protected]>
  • Loading branch information
3 people authored Jun 11, 2024
1 parent 83b2f0b commit c8cfc6c
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 17 deletions.
Binary file added examples/data/uber_10q_march_2022.pdf
Binary file not shown.
3 changes: 2 additions & 1 deletion examples/readers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
"start:pdf": "node --import tsx ./src/pdf.ts",
"start:llamaparse": "node --import tsx ./src/llamaparse.ts",
"start:notion": "node --import tsx ./src/notion.ts",
"start:llamaparse-dir": "node --import tsx ./src/simple-directory-reader-with-llamaparse.ts"
"start:llamaparse-dir": "node --import tsx ./src/simple-directory-reader-with-llamaparse.ts",
"start:llamaparse-json": "node --import tsx ./src/llamaparse-json.ts"
},
"dependencies": {
"llamaindex": "*"
Expand Down
30 changes: 30 additions & 0 deletions examples/readers/src/llamaparse-json.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import fs from "fs/promises";
import { LlamaParseReader } from "llamaindex";

/**
 * Example entry point: parses a sample PDF with LlamaParse in JSON mode and
 * dumps the result to two files.
 *
 * - `jsonObjs.json` receives the full array returned by `loadJson`.
 * - `jsonList.json` receives only the `pages` value of the first array element.
 *
 * A failed write is logged and does not prevent the following steps from running.
 */
async function main() {
  // Serialize `payload` with 4-space indentation into `target`.
  // Any failure (serialization, write, or logging) is reported, not rethrown.
  const dump = async (
    target: string,
    payload: unknown,
    successNote: string,
    failureNote: string,
  ): Promise<void> => {
    try {
      const serialized = JSON.stringify(payload, null, 4);
      await fs.writeFile(target, serialized);
      console.log(successNote);
    } catch (e) {
      console.error(failureNote, e);
    }
  };

  // Parse the sample PDF with a reader configured for JSON output
  const parser = new LlamaParseReader({ resultType: "json" });
  const parsed = await parser.loadJson("../data/uber_10q_march_2022.pdf");

  // First output: the complete array of JSON objects
  await dump(
    "jsonObjs.json",
    parsed,
    "Array of JSON objects has been written to jsonObjs.json",
    "Error writing jsonObjs.json",
  );

  // Second output: just the "pages" entry of the first object.
  // Read only after the first write attempt, so the full dump is attempted
  // even if this lookup throws.
  const pages = parsed[0]["pages"];
  await dump(
    "jsonList.json",
    pages,
    "List of JSON objects as single array has been written to jsonList.json",
    "Error writing jsonList.json",
  );
}

// Run the example; a rejection from main() is passed to console.error.
main().catch(console.error);
33 changes: 17 additions & 16 deletions packages/core/src/readers/LlamaParseReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,10 @@ export class LlamaParseReader extends FileReader {
// Create a job for the LlamaParse API
private async createJob(data: Buffer): Promise<string> {
// Load data, set the mime type
const mimeType = await this.getMimeType(data);
const { mimeType, extension } = await this.getMimeType(data);

if (this.verbose) {
console.log(`Starting load for file with mimeType: ${mimeType}`);
console.log(`Starting load for ${extension} file`);
}

const body = new FormData();
Expand Down Expand Up @@ -290,25 +290,25 @@ export class LlamaParseReader extends FileReader {
];
}
/**
* Loads data from a file and returns its contents as a JSON object.
* Loads data from a file and returns an array of JSON objects.
* To be used with resultType = "json"
*
* @param {string} file - The path to the file to be loaded.
* @return {Promise<Record<string, any>>} A Promise that resolves to the JSON object.
* @return {Promise<Record<string, any>[]>} A Promise that resolves to an array of JSON objects.
*/
async loadJson(file: string): Promise<Record<string, any>> {
async loadJson(file: string): Promise<Record<string, any>[]> {
const data = await fs.readFile(file);
// Creates a job for the file
const jobId = await this.createJob(data);
if (this.verbose) {
console.log(`Started parsing the file under job id ${jobId}`);
}

// Return results as JSON object
// Return results as an array of JSON objects (same format as Python version of the reader)
const resultJson = await this.getJobResult(jobId, "json");
resultJson.job_id = jobId;
resultJson.file_path = file;
return resultJson;
return [resultJson];
}

/**
Expand Down Expand Up @@ -370,18 +370,19 @@ export class LlamaParseReader extends FileReader {
return images;
}

private async getMimeType(data: Buffer): Promise<string> {
const mimes = filetypemime(data);
const validMime = mimes.find((mime) =>
Object.values(SupportedFiles).includes(mime),
);
if (!validMime) {
private async getMimeType(
data: Buffer,
): Promise<{ mimeType: string; extension: string }> {
const mimes = filetypemime(data); // Get an array of possible MIME types
const extension = Object.keys(SupportedFiles).find(
(ext) => SupportedFiles[ext] === mimes[0],
); // Find the extension for the first MIME type
if (!extension) {
const supportedExtensions = Object.keys(SupportedFiles).join(", ");
throw new Error(
`File has type "${mimes}" which does not match supported MIME Types. Supported formats include: ${supportedExtensions}`,
`File has type "${mimes[0]}" which does not match supported MIME Types. Supported formats include: ${supportedExtensions}`,
);
}

return validMime;
return { mimeType: mimes[0], extension }; // Return the first MIME type and its corresponding extension
}
}

0 comments on commit c8cfc6c

Please sign in to comment.