Skip to content

Commit

Permalink
feat: LlamaParseReader: update Supported File Types to match python v…
Browse files Browse the repository at this point in the history
…ersion (#823)
  • Loading branch information
KindOfAScam authored May 9, 2024
1 parent b99ab05 commit a1a72ab
Showing 1 changed file with 56 additions and 9 deletions.
65 changes: 56 additions & 9 deletions packages/core/src/readers/LlamaParseReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,49 @@ import { filetypemime } from "magic-bytes.js";
import { Document } from "../Node.js";
import type { FileReader, Language, ResultType } from "./type.js";

/**
 * Lookup table from file extension (including the leading dot) to the MIME
 * type associated with that extension.
 *
 * The values are the set of MIME types a detected file type is matched
 * against, and the keys are listed (in insertion order) in the
 * "Supported formats include" error text, so entry order is kept stable.
 * Several extensions intentionally share one MIME type.
 */
const SupportedFiles: Record<string, string> = {
  ".pdf": "application/pdf",
  ".doc": "application/msword",
  ".docx":
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  ".docm": "application/vnd.ms-word.document.macroEnabled.12",
  ".dot": "application/msword",
  ".dotx":
    "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
  ".dotm": "application/vnd.ms-word.template.macroEnabled.12",
  ".rtf": "application/rtf",
  ".wps": "application/vnd.ms-works",
  ".wpd": "application/wordperfect",
  ".sxw": "application/vnd.sun.xml.writer",
  ".stw": "application/vnd.sun.xml.writer.template",
  ".sxg": "application/vnd.sun.xml.writer.global",
  ".pages": "application/x-iwork-pages-sffpages",
  ".mw": "application/macwriteii",
  ".mcw": "application/macwriteii",
  ".uot": "application/x-uo",
  ".uof": "application/vnd.uoml+xml",
  ".uos": "application/vnd.sun.xml.calc",
  ".uop": "application/vnd.openofficeorg.presentation",
  ".ppt": "application/vnd.ms-powerpoint",
  ".pptx":
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  ".pot": "application/vnd.ms-powerpoint",
  ".pptm": "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
  ".potx":
    "application/vnd.openxmlformats-officedocument.presentationml.template",
  ".potm": "application/vnd.ms-powerpoint.template.macroEnabled.12",
  ".key": "application/x-iwork-keynote-sffkey",
  ".odp": "application/vnd.oasis.opendocument.presentation",
  ".odg": "application/vnd.oasis.opendocument.graphics",
  ".otp": "application/vnd.oasis.opendocument.presentation-template",
  ".fopd": "application/vnd.oasis.opendocument.presentation",
  ".sxi": "application/vnd.sun.xml.impress",
  ".sti": "application/vnd.sun.xml.impress.template",
  ".epub": "application/epub+zip",
  ".html": "text/html",
  ".htm": "text/html",
};

/**
* Represents a reader for parsing files using the LlamaParse API.
* See https://github.com/run-llama/llama_parse
Expand Down Expand Up @@ -40,15 +83,12 @@ export class LlamaParseReader implements FileReader {
file: string,
fs: GenericFileSystem = defaultFS,
): Promise<Document[]> {
if (!file.endsWith(".pdf")) {
throw new Error("Currently, only PDF files are supported.");
}

const metadata = { file_path: file };

// Load data, set the mime type
const data = await fs.readRawFile(file);
const mimeType = await this.getMimeType(data);

const body = new FormData();
body.set("file", new Blob([data], { type: mimeType }), file);
body.append("language", this.language);
Expand All @@ -67,7 +107,7 @@ export class LlamaParseReader implements FileReader {
headers,
});
if (!response.ok) {
throw new Error(`Failed to parse the PDF file: ${await response.text()}`);
throw new Error(`Failed to parse the file: ${await response.text()}`);
}
const jsonResponse = await response.json();

Expand All @@ -94,7 +134,7 @@ export class LlamaParseReader implements FileReader {
const end = Date.now();
if (end - start > this.maxTimeout * 1000) {
throw new Error(
`Timeout while parsing the PDF file: ${await response.text()}`,
`Timeout while parsing the file: ${await response.text()}`,
);
}
if (this.verbose && tries % 10 === 0) {
Expand All @@ -116,9 +156,16 @@ export class LlamaParseReader implements FileReader {

private async getMimeType(data: Buffer): Promise<string> {
const mimes = filetypemime(data);
if (!mimes.includes("application/pdf")) {
throw new Error("Currently, only PDF files are supported.");
const validMime = mimes.find((mime) =>
Object.values(SupportedFiles).includes(mime),
);
if (!validMime) {
const supportedExtensions = Object.keys(SupportedFiles).join(", ");
throw new Error(
`File has type "${mimes}" which does not match supported MIME Types. Supported formats include: ${supportedExtensions}`,
);
}
return "application/pdf";

return validMime;
}
}

0 comments on commit a1a72ab

Please sign in to comment.