diff --git a/packages/core/src/readers/LlamaParseReader.ts b/packages/core/src/readers/LlamaParseReader.ts
index 8176117432..327878c4c1 100644
--- a/packages/core/src/readers/LlamaParseReader.ts
+++ b/packages/core/src/readers/LlamaParseReader.ts
@@ -3,6 +3,54 @@ import { filetypemime } from "magic-bytes.js";
 import { Document } from "../Node.js";
 import type { FileReader, Language, ResultType } from "./type.js";
 
+/**
+ * File extensions mapped to the MIME types this reader accepts.
+ * The values are what `getMimeType` matches detected types against; the keys
+ * are only used to list the supported formats in error messages.
+ */
+const SupportedFiles: { [key: string]: string } = {
+  ".pdf": "application/pdf",
+  ".doc": "application/msword",
+  ".docx":
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+  ".docm": "application/vnd.ms-word.document.macroEnabled.12",
+  ".dot": "application/msword",
+  ".dotx":
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+  ".dotm": "application/vnd.ms-word.template.macroEnabled.12",
+  ".rtf": "application/rtf",
+  ".wps": "application/vnd.ms-works",
+  ".wpd": "application/wordperfect",
+  ".sxw": "application/vnd.sun.xml.writer",
+  ".stw": "application/vnd.sun.xml.writer.template",
+  ".sxg": "application/vnd.sun.xml.writer.global",
+  ".pages": "application/x-iwork-pages-sffpages",
+  ".mw": "application/macwriteii",
+  ".mcw": "application/macwriteii",
+  ".uot": "application/x-uo",
+  ".uof": "application/vnd.uoml+xml",
+  ".uos": "application/vnd.sun.xml.calc",
+  ".uop": "application/vnd.openofficeorg.presentation",
+  ".ppt": "application/vnd.ms-powerpoint",
+  ".pptx":
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+  ".pot": "application/vnd.ms-powerpoint",
+  ".pptm": "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
+  ".potx":
+    "application/vnd.openxmlformats-officedocument.presentationml.template",
+  ".potm": "application/vnd.ms-powerpoint.template.macroEnabled.12",
+  ".key": "application/x-iwork-keynote-sffkey",
+  ".odp": "application/vnd.oasis.opendocument.presentation",
+  ".odg": "application/vnd.oasis.opendocument.graphics",
+  ".otp": "application/vnd.oasis.opendocument.presentation-template",
+  ".fopd": "application/vnd.oasis.opendocument.presentation",
+  ".sxi": "application/vnd.sun.xml.impress",
+  ".sti": "application/vnd.sun.xml.impress.template",
+  ".epub": "application/epub+zip",
+  ".html": "text/html",
+  ".htm": "text/html",
+};
+
 /**
  * Represents a reader for parsing files using the LlamaParse API.
  * See https://github.com/run-llama/llama_parse
@@ -40,15 +88,12 @@ export class LlamaParseReader implements FileReader {
     file: string,
     fs: GenericFileSystem = defaultFS,
   ): Promise<Document[]> {
-    if (!file.endsWith(".pdf")) {
-      throw new Error("Currently, only PDF files are supported.");
-    }
-
     const metadata = { file_path: file };
 
     // Load data, set the mime type
     const data = await fs.readRawFile(file);
     const mimeType = await this.getMimeType(data);
+
     const body = new FormData();
     body.set("file", new Blob([data], { type: mimeType }), file);
     body.append("language", this.language);
@@ -67,7 +112,7 @@ export class LlamaParseReader implements FileReader {
       headers,
     });
     if (!response.ok) {
-      throw new Error(`Failed to parse the PDF file: ${await response.text()}`);
+      throw new Error(`Failed to parse the file: ${await response.text()}`);
     }
     const jsonResponse = await response.json();
 
@@ -94,7 +139,7 @@ export class LlamaParseReader implements FileReader {
       const end = Date.now();
       if (end - start > this.maxTimeout * 1000) {
         throw new Error(
-          `Timeout while parsing the PDF file: ${await response.text()}`,
+          `Timeout while parsing the file: ${await response.text()}`,
         );
       }
       if (this.verbose && tries % 10 === 0) {
@@ -116,9 +161,25 @@ export class LlamaParseReader implements FileReader {
 
+  /**
+   * Detects the MIME type of the file content from its magic bytes.
+   *
+   * @param data - Raw file content.
+   * @returns The first detected MIME type that appears in `SupportedFiles`.
+   * @throws Error if none of the detected MIME types is supported.
+   */
   private async getMimeType(data: Buffer): Promise<string> {
     const mimes = filetypemime(data);
-    if (!mimes.includes("application/pdf")) {
-      throw new Error("Currently, only PDF files are supported.");
+    const supportedMimes = new Set(Object.values(SupportedFiles));
+    const validMime = mimes.find((mime) => supportedMimes.has(mime));
+    if (!validMime) {
+      // `mimes` is an array and may be empty; format it explicitly so the
+      // message never reads `File has type ""`.
+      const detected = mimes.length > 0 ? mimes.join(", ") : "unknown";
+      const supportedExtensions = Object.keys(SupportedFiles).join(", ");
+      throw new Error(
+        `File has type "${detected}" which does not match supported MIME Types. Supported formats include: ${supportedExtensions}`,
+      );
     }
-    return "application/pdf";
+
+    return validMime;
   }
 }