Skip to content

Commit

Permalink
Merge pull request #42 from QuivrHQ/feat/megaparse-llm
Browse files Browse the repository at this point in the history
add: llm megaparser
  • Loading branch information
chloedia authored Jun 26, 2024
2 parents 58b70f5 + a0ab0ba commit 3365699
Show file tree
Hide file tree
Showing 5 changed files with 263 additions and 69 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,11 @@ print(content)
<!---BENCHMARK-->
| Parser | Diff |
|---|---|
| Megaparse with LLamaParse and GPTCleaner | 84 |
| **Megaparse** | 100 |
| Megaparse with LLamaParse | 104 |
| LLama Parse | 108 |
| LMM megaparse | 39 |
| Megaparse with LLamaParse and GPTCleaner | 74 |
| Megaparse with LLamaParse | 97 |
| LLama Parse | 102 |
| **Megaparse** | 105 |
<!---END_BENCHMARK-->

*Lower is better*
Expand Down
46 changes: 27 additions & 19 deletions megaparse/Converter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
from enum import Enum
import os
from docx.document import Document as DocumentObject
from docx import Document
Expand All @@ -23,6 +24,7 @@
from llama_index.core import download_loader
from unstructured.partition.auto import partition
import pandas as pd
from megaparse.multimodal_convertor.megaparse_vision import MegaParseVision


class Converter:
Expand Down Expand Up @@ -227,16 +229,30 @@ def save_md(self, md_content: str, file_path: Path | str) -> None:
f.write(md_content)


class MethodEnum(str, Enum):
    """Method to use for the conversion"""

    # Remote parsing through the LlamaParse API (convert() asserts an API
    # key is present when this method is selected).
    LLAMA_PARSE = "llama_parse"
    # Local parsing with the `unstructured` library (the default).
    UNSTRUCTURED = "unstructured"
    # Page-image parsing with a multimodal LLM (MegaParseVision).
    MEGAPARSE_VISION = "megaparse_vision"


class PDFConverter:
def __init__(
self,
llama_parse_api_key: str,
method: MethodEnum | str = MethodEnum.UNSTRUCTURED,
handle_pagination: bool = True,
handle_header: bool = True,
) -> None:
self.handle_pagination = handle_pagination
self.handle_header = handle_header
self.llama_parse_api_key = llama_parse_api_key
if isinstance(method, str):
try:
method = MethodEnum(method)
except ValueError:
raise ValueError(f"Method {method} not supported")
self.method = method

async def _llama_parse(self, api_key: str, file_path: str):
parsing_instructions = "Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. Keep the same format for similar tables."
Expand All @@ -255,13 +271,22 @@ async def _llama_parse(self, api_key: str, file_path: str):
def _unstructured_parse(self, file_path: str):
unstructured_parser = UnstructuredParser()
return unstructured_parser.convert(file_path)

async def _lmm_parse(self, file_path: str):
lmm_parser = MegaParseVision()
return await lmm_parser.parse(file_path)

async def convert(self, file_path: str, gpt4o_cleaner=False) -> str:
parsed_md = ""
if self.llama_parse_api_key:
if self.method == MethodEnum.LLAMA_PARSE:
assert self.llama_parse_api_key is not None, "LLama Parse API key is required for this method"
parsed_md = await self._llama_parse(self.llama_parse_api_key, file_path)
else:
elif self.method == MethodEnum.MEGAPARSE_VISION:
parsed_md = await self._lmm_parse(file_path)
elif self.method == MethodEnum.UNSTRUCTURED:
parsed_md = self._unstructured_parse(file_path)
else:
raise ValueError(f"Method {self.method} not supported")

if not (self.handle_pagination or self.handle_header):
return parsed_md
Expand All @@ -284,23 +309,6 @@ def __init__(self, file_path: str, llama_parse_api_key: str | None = None) -> No
self.file_path = file_path
self.llama_parse_api_key = llama_parse_api_key

# def convert(self, **kwargs) -> str:
# file_extension: str = os.path.splitext(self.file_path)[1]
# if file_extension == ".docx":
# converter = DOCXConverter(
# file_path=self.file_path, file_extension=file_extension
# )
# elif file_extension == ".pptx":
# converter = PPTXConverter(
# file_path=self.file_path, file_extension=file_extension
# )
# elif file_extension == ".pdf":
# converter = PDFConverter(llama_parse_api_key=self.llama_parse_api_key)
# else:
# print(self.file_path, file_extension)
# raise ValueError(f"Unsupported file extension: {file_extension}")
# return converter.convert(self.file_path, **kwargs)

def convert(self, **kwargs) -> str:
file_extension: str = os.path.splitext(self.file_path)[1]
if file_extension == ".docx":
Expand Down
182 changes: 182 additions & 0 deletions megaparse/multimodal_convertor/megaparse_vision.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
from enum import Enum
from io import BytesIO
from typing import List
from pypdf import PdfReader, PdfWriter
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
import base64
from pdf2image import convert_from_path
import asyncio
import re

# BASE_OCR_PROMPT = """
# Transcribe the content of this file into markdown. Be mindful of the formatting.
# Add formatting if you think it is not clear.
# Do not include page breaks and merge content of tables if it is continued in the next page.
# Add tags around what you identify as a table [TABLE], header - complete chain of characters that are repeated at each start of pages - [HEADER], table of content [TOC] in the format '[tag] ... [/tag]'
# Return only the parsed content.
# """

# Instructions sent to the multimodal model alongside the page images.
# Fix: the numbered list previously jumped from 8 to 10 (no item 9), which
# can confuse instruction-following models; renumbered the last item to 9.
BASE_OCR_PROMPT = """
You are tasked with transcribing and formatting the content of a file into markdown. Your goal is to create a well-structured, readable markdown document that accurately represents the original content while adding appropriate formatting and tags.
Follow these instructions to complete the task:
1. Carefully read through the entire file content.
2. Transcribe the content into markdown format, paying close attention to the existing formatting and structure.
3. If you encounter any unclear formatting in the original content, use your judgment to add appropriate markdown formatting to improve readability and structure.
4. For tables, headers, and table of contents, add the following tags:
- Tables: Enclose the entire table in [TABLE] and [/TABLE] tags. Merge content of tables if it is continued in the next page.
- Headers (complete chain of characters repeated at the start of each page): Enclose in [HEADER] and [/HEADER] tags inside the markdown file.
- Table of contents: Enclose in [TOC] and [/TOC] tags
5. When transcribing tables:
- If a table continues across multiple pages, merge the content into a single, cohesive table.
- Use proper markdown table formatting with pipes (|) and hyphens (-) for table structure.
6. Do not include page breaks in your transcription.
7. Maintain the logical flow and structure of the document, ensuring that sections and subsections are properly formatted using markdown headers (# for main headers, ## for subheaders, etc.).
8. Use appropriate markdown syntax for other formatting elements such as bold, italic, lists, and code blocks as needed.
9. Return only the parsed content in markdown format, including the specified tags for tables, headers, and table of contents.
"""

class ModelEnum(str, Enum):
    """Model to use for the conversion"""

    # Anthropic Claude — accepted by MegaParseVision.__init__ but raises
    # NotImplementedError there.
    CLAUDE = "claude-3.5"
    # OpenAI GPT-4o — the default, and currently the only working model.
    GPT4O = "gpt-4o"

class TagEnum(str, Enum):
    """Possible tags for the elements in the file"""

    # Tag values match the bracketed markers requested in BASE_OCR_PROMPT,
    # e.g. [TABLE] ... [/TABLE].
    TABLE = "TABLE"
    TOC = "TOC"
    HEADER = "HEADER"
    IMAGE = "IMAGE"


class MegaParseVision:
    """Convert a PDF into markdown by rendering each page to an image and
    transcribing batches of pages with a multimodal chat model."""

    def __init__(self, model: ModelEnum = ModelEnum.GPT4O):
        """Build the parser around the requested model.

        :param model: Multimodal model to use (only GPT4O is implemented).
        :raises NotImplementedError: if *model* is ``ModelEnum.CLAUDE``.
        :raises ValueError: for any other unsupported model.
        """
        if model == ModelEnum.GPT4O:
            self.model = ChatOpenAI(model="gpt-4o")
        elif model == ModelEnum.CLAUDE:
            raise NotImplementedError("Claude support not yet implemented")
        else:
            raise ValueError(f"Model {model} not supported")

        # Filled in by parse(): one raw markdown string per page batch.
        self.parsed_chunks: list[str] | None = None

    def process_file(self, file_path: str, image_format: str = 'PNG') -> List[str]:
        """
        Process a PDF file and convert its pages to base64 encoded images.
        :param file_path: Path to the PDF file
        :param image_format: Format to save the images (default: PNG)
        :return: List of base64 encoded images, one per page
        :raises ValueError: if the PDF cannot be read or rendered
        """
        try:
            images = convert_from_path(file_path)
            images_base64 = []
            for image in images:
                buffered = BytesIO()
                image.save(buffered, format=image_format)
                image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
                images_base64.append(image_base64)
            return images_base64
        except Exception as e:
            # Chain the original exception so the root cause stays visible.
            raise ValueError(f"Error processing PDF file: {str(e)}") from e

    def get_element(self, tag: TagEnum, chunk: str) -> List[str]:
        """Return the stripped inner text of every [TAG]...[/TAG] span in *chunk*."""
        pattern = rf'\[{tag.value}\]([\s\S]*?)\[/{tag.value}\]'
        all_elmts = re.findall(pattern, chunk)
        if not all_elmts:
            print(f"No {tag.value} found in the chunk")
            return []
        return [elmt.strip() for elmt in all_elmts]

    async def send_to_mlm(self, images_data: List[str]) -> str:
        """
        Send images to the language model for processing.
        :param images_data: List of base64 encoded images
        :return: Processed content as a string
        """
        # NOTE(review): the data URL declares image/jpeg while process_file
        # encodes PNG by default; most providers sniff the real format, but
        # the two should be kept in sync — TODO confirm and align.
        images_prompt = [
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
            }
            for image_data in images_data
        ]
        message = HumanMessage(
            content=[
                {"type": "text", "text": BASE_OCR_PROMPT},
                *images_prompt,
            ],
        )
        response = await self.model.ainvoke([message])
        return str(response.content)

    async def parse(self, file_path: str, batch_size: int = 3) -> str:
        """
        Parse a PDF file and process its content using the language model.
        :param file_path: Path to the PDF file
        :param batch_size: Number of pages sent to the model per request
        :return: The cleaned, concatenated markdown for the whole document
        """
        pdf_base64 = self.process_file(file_path)
        # Batches run concurrently; page order is preserved because
        # asyncio.gather returns results in task-submission order.
        tasks = [
            self.send_to_mlm(pdf_base64[i:i + batch_size])
            for i in range(0, len(pdf_base64), batch_size)
        ]
        self.parsed_chunks = await asyncio.gather(*tasks)
        return self.get_cleaned_content("\n".join(self.parsed_chunks))

    def get_cleaned_content(self, parsed_file: str) -> str:
        """
        Get cleaned parsed file without any tags defined in TagEnum.

        This method removes all tags from TagEnum from the parsed file,
        formats the content, and handles the HEADER tag specially by keeping
        only the first occurrence (page headers repeat on every page).

        Args:
            parsed_file (str): The parsed file content with tags.
        Returns:
            str: The cleaned content without TagEnum tags.
        """
        # re.escape works on the members because TagEnum subclasses str.
        tag_pattern = '|'.join(map(re.escape, TagEnum.__members__.values()))
        tag_regex = rf'\[({tag_pattern})\](.*?)\[/\1\]'

        # Handle the HEADER tag specially: drop every repeat, keep the first.
        header_pattern = rf'\[{TagEnum.HEADER.value}\](.*?)\[/{TagEnum.HEADER.value}\]'
        headers = re.findall(header_pattern, parsed_file, re.DOTALL)
        if headers:
            first_header = headers[0].strip()
            # Remove all HEADER tags and their content
            parsed_file = re.sub(header_pattern, '', parsed_file, flags=re.DOTALL)
            # Add the first header back at the beginning
            parsed_file = f"{first_header}\n{parsed_file}"

        # Remove all other tags but keep their inner content.
        def remove_tag(match: re.Match) -> str:
            return match.group(2)

        cleaned_content = re.sub(tag_regex, remove_tag, parsed_file, flags=re.DOTALL)

        # Drop markdown code fences the model may wrap output in, collapse
        # runs of blank lines, and re-join table rows split by a blank line.
        cleaned_content = re.sub(r'^```.*$\n?', '', cleaned_content, flags=re.MULTILINE)
        cleaned_content = re.sub(r'\n\s*\n', '\n\n', cleaned_content)
        cleaned_content = cleaned_content.replace("|\n\n|", "|\n|")
        cleaned_content = cleaned_content.strip()

        return cleaned_content


if __name__ == "__main__":
    # Manual smoke test: parse a sample PDF and print the markdown result.
    vision_parser = MegaParseVision()
    markdown = asyncio.run(vision_parser.parse("megaparse/tests/input_tests/MegaFake_report.pdf"))
    print(markdown)
    print("Done!")

Loading

0 comments on commit 3365699

Please sign in to comment.