From 62c08ffa05bbe718b05717976b7e01a8b7ddd090 Mon Sep 17 00:00:00 2001 From: Sergey Date: Tue, 12 Nov 2024 15:48:33 -0700 Subject: [PATCH 1/3] parsing working --- src/openparse/schemas.py | 2 - src/openparse/text/pdfminer/core.py | 160 ++++++++++++++++++++++------ 2 files changed, 126 insertions(+), 36 deletions(-) diff --git a/src/openparse/schemas.py b/src/openparse/schemas.py index 888240a..ad5ae94 100644 --- a/src/openparse/schemas.py +++ b/src/openparse/schemas.py @@ -360,7 +360,6 @@ class ImageElement(BaseModel): def embed_text(self) -> str: if self._embed_text: return self._embed_text - return self.text @cached_property @@ -381,7 +380,6 @@ def is_at_similar_height( error_margin: float = 1, ) -> bool: y_distance = abs(self.bbox.y1 - other.bbox.y1) - return y_distance <= error_margin diff --git a/src/openparse/text/pdfminer/core.py b/src/openparse/text/pdfminer/core.py index 980b791..bfae247 100644 --- a/src/openparse/text/pdfminer/core.py +++ b/src/openparse/text/pdfminer/core.py @@ -10,7 +10,9 @@ LTTextContainer, LTTextLine, ) +from pdfminer.pdftypes import resolve1 from pdfminer.psparser import PSLiteral +from PIL import Image from pydantic import BaseModel, model_validator from openparse.pdf import Pdf @@ -65,21 +67,32 @@ def _extract_chars(text_line: LTTextLine) -> List[CharElement]: def get_mime_type(pdf_object: LTImage) -> Optional[str]: - subtype = pdf_object.stream.attrs.get("Subtype", PSLiteral(None)).name - filter_ = pdf_object.stream.attrs.get("Filter", PSLiteral(None)).name - if subtype == "Image": - if filter_ == "DCTDecode": - return "image/jpeg" - elif filter_ == "FlateDecode": - return "image/png" # Most likely, but could also be TIFF - elif filter_ == "JPXDecode": - return "image/jp2" - elif filter_ == "CCITTFaxDecode": - return "image/tiff" - elif filter_ == "JBIG2Decode": - return "image/jbig2" - - return None + """Determine the MIME type of an image in a PDF based on its filters.""" + # Resolve the stream attributes + stream_attrs = pdf_object.stream.attrs + subtype = resolve1(stream_attrs.get("Subtype", "")) + filters = resolve1(stream_attrs.get("Filter", "")) + + if isinstance(filters, list): + filter_names = [str(f).lstrip("/").strip("\"'") for f in filters] + else: + filter_names = [str(filters).lstrip("/").strip("\"'")] if filters else [] + + mime_type = None + if "DCTDecode" in filter_names: + mime_type = "image/jpeg" + elif "JPXDecode" in filter_names: + mime_type = "image/jp2" + elif "FlateDecode" in filter_names: + # Could be PNG or TIFF; may need additional information + # For simplicity, assume PNG + mime_type = "image/png" + elif "CCITTFaxDecode" in filter_names: + mime_type = "image/tiff" + elif "JBIG2Decode" in filter_names: + mime_type = "image/jbig2" + + return mime_type def _group_chars_into_spans(chars: Iterable[CharElement]) -> List[TextSpan]: @@ -143,6 +156,75 @@ def _get_bbox(lines: List[LineElement]) -> Tuple[float, float, float, float]: return x0, y0, x1, y1 +def group_overlapping_images( + image_elements: List[ImageElement], buffer: float = 1.0 +) -> List[List[ImageElement]]: + """Group images that overlap or are adjacent.""" + groups = [] + used = set() + + for i, elem1 in enumerate(image_elements): + if i in used: + continue + group = [elem1] + used.add(i) + for j, elem2 in enumerate(image_elements[i + 1 :], start=i + 1): + if j in used: + continue + if any(elem2.overlaps(e, buffer=buffer) for e in group): + group.append(elem2) + used.add(j) + groups.append(group) + return groups + + +def _process_png_image(e: LTImage) -> Optional[bytes]: + try: + # Extract image attributes + width = e.stream.attrs.get("Width") + height = e.stream.attrs.get("Height") + color_space = e.stream.attrs.get("ColorSpace", "DeviceRGB") + + # Resolve indirect references in ColorSpace + color_space = resolve1(color_space) + + # Ensure color_space is a string + if isinstance(color_space, list): + color_space = color_space[0] + if isinstance(color_space, PSLiteral): + color_space = color_space.name + if isinstance(color_space, str): + color_space = color_space.strip("/") + else: + print(f"Unsupported color space type: {type(color_space)}") + return None + + # Map PDF color space to PIL mode + if color_space == "DeviceRGB": + mode = "RGB" + elif color_space == "DeviceGray": + mode = "L" + elif color_space == "DeviceCMYK": + mode = "CMYK" + else: + print(f"Unsupported color space: {color_space}") + return None + + # Get the image data after filters have been applied + img_data = e.stream.get_data() + + # Create image using Pillow + img = Image.frombytes(mode, (width, height), img_data) + + # Convert to PNG bytes + output = BytesIO() + img.save(output, format="PNG") + return output.getvalue() + except Exception as ex: + print(f"Error processing PNG image: {ex}") + return None + + def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]: """Parse PDF and return a list of TextElement and ImageElement objects.""" elements = [] @@ -151,6 +233,7 @@ def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]: for page_num, page_layout in enumerate(page_layouts): page_width = page_layout.width page_height = page_layout.height + text_elements = [] for element in page_layout: if isinstance(element, LTTextContainer): lines = [] @@ -161,7 +244,7 @@ def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]: continue bbox = _get_bbox(lines) - elements.append( + text_elements.append( TextElement( bbox=Bbox( x0=bbox[0], @@ -177,26 +260,35 @@ def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]: ) ) elif isinstance(element, LTFigure): - for e in element._objs: + for e in element: if isinstance(e, LTImage): mime_type = get_mime_type(e) if mime_type: - img_data = BytesIO(e.stream.get_data()).getvalue() - base64_string = base64.b64encode(img_data).decode("utf-8") - elements.append( - ImageElement( - bbox=Bbox( - x0=e.bbox[0], - y0=e.bbox[1], - x1=e.bbox[2], - y1=e.bbox[3], - page=page_num, - page_width=page_width, - page_height=page_height, - ), - image=base64_string, - image_mimetype=mime_type or "unknown", - text="", + if mime_type == "image/png": + img_data = _process_png_image(e) + else: + img_data = e.stream.get_data() + if img_data: + base64_string = base64.b64encode(img_data).decode( + "utf-8" ) - ) + elements.append( + ImageElement( + bbox=Bbox( + x0=e.bbox[0], + y0=e.bbox[1], + x1=e.bbox[2], + y1=e.bbox[3], + page=page_num, + page_width=page_width, + page_height=page_height, + ), + image=base64_string, + image_mimetype=mime_type or "unknown", + text="", + ) + ) + + # Add text elements + elements.extend(text_elements) return elements From 444e9475b5d21dceb7c67483fa118bc9f8923e4a Mon Sep 17 00:00:00 2001 From: Sergey Date: Tue, 12 Nov 2024 17:23:22 -0700 Subject: [PATCH 2/3] baseline working --- src/openparse/processing/basic_transforms.py | 103 ++++++++++++++++++- src/openparse/processing/ingest.py | 3 + src/openparse/schemas.py | 27 ++++- src/openparse/text/pdfminer/core.py | 43 ++------ 4 files changed, 140 insertions(+), 36 deletions(-) diff --git a/src/openparse/processing/basic_transforms.py b/src/openparse/processing/basic_transforms.py index d78efb1..ac54ca7 100644 --- a/src/openparse/processing/basic_transforms.py +++ b/src/openparse/processing/basic_transforms.py @@ -1,8 +1,12 @@ +import base64 +import io from abc import ABC, abstractmethod from collections import defaultdict from typing import Dict, List, Literal -from openparse.schemas import Bbox, Node, TextElement +from PIL import Image + +from openparse.schemas import Bbox, ImageElement, Node, TextElement class ProcessingStep(ABC): @@ -14,6 +18,96 @@ def process(self, nodes: List[Node]) -> List[Node]: raise NotImplementedError("Subclasses must implement this method.") +class CombineSlicedImages(ProcessingStep): + """ + PDF will slice images into multiple pieces if they are too large. This combines them back together. + """ + + def _combine_images_in_group( + self, image_elements: List[ImageElement] + ) -> ImageElement: + """Combine a list of ImageElements into a single ImageElement.""" + if not image_elements: + raise ValueError("No images to combine.") + + images = [] + for node in image_elements: + image_data = base64.b64decode(node.image) + image = Image.open(io.BytesIO(image_data)) + image = image.rotate(180) + images.append(image) + + # Determine the width and total height of the final image + width = max(img.width for img in images) + total_height = sum(img.height for img in images) + + # Create a new blank image + new_image = Image.new("RGB", (width, total_height)) + + # Paste images one below the other + y_offset = 0 + for img in images: + new_image.paste(img, (0, y_offset)) + y_offset += img.height + + # Save or encode the final image + buffered = io.BytesIO() + new_image.save(buffered, format="PNG") + final_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + + return ImageElement( + bbox=image_elements[0].bbox, + image=final_base64, + image_mimetype="image/png", + text="", + ) + + def _group_overlapping_images( + self, image_elements: List[ImageElement], buffer: float = 1.0 + ) -> List[List[ImageElement]]: + """Group images that overlap or are adjacent.""" + groups = [] + used = set() + + for i, elem1 in enumerate(image_elements): + if i in used: + continue + group = [elem1] + used.add(i) + queue = [elem1] + while queue: + current = queue.pop() + for j, elem2 in enumerate(image_elements): + if j in used: + continue + if current.overlaps(elem2, buffer=buffer): + group.append(elem2) + used.add(j) + queue.append(elem2) + groups.append(group) + return groups + + def process(self, nodes: List[Node]) -> List[Node]: + nodes_by_page: Dict[int, List[Node]] = defaultdict(list) + for node in nodes: + pages = {element.bbox.page for element in node.elements} + for page in pages: + nodes_by_page[page].append(node) + + new_nodes = [] + for page, page_nodes in nodes_by_page.items(): + image_nodes = [e for e in page_nodes if e.variant == {"image"}] + if image_nodes: + image_elements: List[ImageElement] = [ + sub_e for e in image_nodes for sub_e in e.elements + ] # type: ignore + combined_image = self._combine_images_in_group(image_elements) + new_nodes.append(Node(elements=(combined_image,))) + else: + new_nodes.extend(page_nodes) + return new_nodes + + class RemoveTextInsideTables(ProcessingStep): """ If we're using the table extraction pipeline, we need to remove text that is inside tables to avoid duplication. @@ -162,7 +256,12 @@ def __init__(self, min_tokens: int): self.min_tokens = min_tokens def process(self, nodes: List[Node]) -> List[Node]: - return [node for node in nodes if node.tokens >= self.min_tokens] + res = [] + for node in nodes: + if node.tokens <= self.min_tokens and "image" not in node.variant: + continue + res.append(node) + return res class CombineNodesSpatially(ProcessingStep): diff --git a/src/openparse/processing/ingest.py b/src/openparse/processing/ingest.py index a08f84b..c7d8df4 100644 --- a/src/openparse/processing/ingest.py +++ b/src/openparse/processing/ingest.py @@ -6,6 +6,7 @@ CombineBullets, CombineHeadingsWithClosestText, CombineNodesSpatially, + CombineSlicedImages, ProcessingStep, RemoveFullPageStubs, RemoveMetadataElements, @@ -69,6 +70,7 @@ class BasicIngestionPipeline(IngestionPipeline): def __init__(self): self.transformations = [ RemoveTextInsideTables(), + CombineSlicedImages(), RemoveFullPageStubs(max_area_pct=0.35), # mostly aimed at combining bullets and weird formatting CombineNodesSpatially( @@ -106,6 +108,7 @@ def __init__( self.transformations = [ RemoveTextInsideTables(), + CombineSlicedImages(), RemoveFullPageStubs(max_area_pct=0.35), # mostly aimed at combining bullets and weird formatting CombineNodesSpatially( diff --git a/src/openparse/schemas.py b/src/openparse/schemas.py index ad5ae94..0476f87 100644 --- a/src/openparse/schemas.py +++ b/src/openparse/schemas.py @@ -382,6 +382,18 @@ def is_at_similar_height( y_distance = abs(self.bbox.y1 - other.bbox.y1) return y_distance <= error_margin + def overlaps(self, other: "ImageElement", buffer: float = 1.0) -> bool: + """Check if this image overlaps or is adjacent to another image, considering a buffer.""" + if self.bbox.page != other.bbox.page: + return False + + return not ( + self.bbox.x1 + buffer < other.bbox.x0 - buffer + or self.bbox.x0 - buffer > other.bbox.x1 + buffer + or self.bbox.y1 + buffer < other.bbox.y0 - buffer + or self.bbox.y0 - buffer > other.bbox.y1 + buffer + ) + ############# ### NODES ### @@ -639,7 +651,20 @@ def _repr_markdown_(self): """ When called in a Jupyter environment, this will display the node as Markdown, which Jupyter will then render as HTML. """ - return self.text + markdown_parts = [] + for element in self.elements: + if element.variant == NodeVariant.TEXT: + markdown_parts.append(element.text) + elif element.variant == NodeVariant.IMAGE: + image_data = element.image + mime_type = element.image_mimetype + if mime_type == "unknown": + mime_type = "image/png" + markdown_image = f"![Image](data:{mime_type};base64,{image_data})" + markdown_parts.append(markdown_image) + elif element.variant == NodeVariant.TABLE: + markdown_parts.append(element.text) + return "\n\n".join(markdown_parts) def __add__(self, other: "Node") -> "Node": """ diff --git a/src/openparse/text/pdfminer/core.py b/src/openparse/text/pdfminer/core.py index bfae247..b34adae 100644 --- a/src/openparse/text/pdfminer/core.py +++ b/src/openparse/text/pdfminer/core.py @@ -1,4 +1,5 @@ import base64 +import logging from io import BytesIO from typing import Any, Iterable, List, Optional, Tuple, Union @@ -66,7 +67,7 @@ def _extract_chars(text_line: LTTextLine) -> List[CharElement]: return chars -def get_mime_type(pdf_object: LTImage) -> Optional[str]: +def _get_mime_type(pdf_object: LTImage) -> Optional[str]: """Determine the MIME type of an image in a PDF based on its filters.""" # Resolve the stream attributes stream_attrs = pdf_object.stream.attrs @@ -156,33 +157,11 @@ def _get_bbox(lines: List[LineElement]) -> Tuple[float, float, float, float]: return x0, y0, x1, y1 -def group_overlapping_images( - image_elements: List[ImageElement], buffer: float = 1.0 -) -> List[List[ImageElement]]: - """Group images that overlap or are adjacent.""" - groups = [] - used = set() - - for i, elem1 in enumerate(image_elements): - if i in used: - continue - group = [elem1] - used.add(i) - for j, elem2 in enumerate(image_elements[i + 1 :], start=i + 1): - if j in used: - continue - if any(elem2.overlaps(e, buffer=buffer) for e in group): - group.append(elem2) - used.add(j) - groups.append(group) - return groups - - def _process_png_image(e: LTImage) -> Optional[bytes]: try: # Extract image attributes - width = e.stream.attrs.get("Width") - height = e.stream.attrs.get("Height") + width = e.stream.attrs.get("Width", 0) + height = e.stream.attrs.get("Height", 0) color_space = e.stream.attrs.get("ColorSpace", "DeviceRGB") # Resolve indirect references in ColorSpace @@ -207,7 +186,7 @@ def _process_png_image(e: LTImage) -> Optional[bytes]: elif color_space == "DeviceCMYK": mode = "CMYK" else: - print(f"Unsupported color space: {color_space}") + logging.info(f"Unsupported color space: {color_space}") return None # Get the image data after filters have been applied @@ -233,7 +212,7 @@ def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]: for page_num, page_layout in enumerate(page_layouts): page_width = page_layout.width page_height = page_layout.height - text_elements = [] + page_elements = [] for element in page_layout: if isinstance(element, LTTextContainer): lines = [] @@ -244,7 +223,7 @@ def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]: continue bbox = _get_bbox(lines) - text_elements.append( + page_elements.append( TextElement( bbox=Bbox( x0=bbox[0], @@ -262,7 +241,7 @@ def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]: elif isinstance(element, LTFigure): for e in element: if isinstance(e, LTImage): - mime_type = get_mime_type(e) + mime_type = _get_mime_type(e) if mime_type: if mime_type == "image/png": img_data = _process_png_image(e) @@ -272,7 +251,7 @@ def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]: base64_string = base64.b64encode(img_data).decode( "utf-8" ) - elements.append( + page_elements.append( ImageElement( bbox=Bbox( x0=e.bbox[0], @@ -288,7 +267,5 @@ def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]: text="", ) ) - - # Add text elements - elements.extend(text_elements) + elements.extend(page_elements) return elements From dd539691e7f31a819f7e7200cebdd72ae75bf892 Mon Sep 17 00:00:00 2001 From: Sergey Date: Tue, 12 Nov 2024 18:00:58 -0700 Subject: [PATCH 3/3] kinda working --- src/cookbooks/images.ipynb | 0 src/openparse/processing/basic_transforms.py | 25 ++++++++++++++------ src/openparse/text/pdfminer/core.py | 10 ++++---- 3 files changed, 24 insertions(+), 11 deletions(-) create mode 100644 src/cookbooks/images.ipynb diff --git a/src/cookbooks/images.ipynb b/src/cookbooks/images.ipynb new file mode 100644 index 0000000..e69de29 diff --git a/src/openparse/processing/basic_transforms.py b/src/openparse/processing/basic_transforms.py index ac54ca7..5070c1d 100644 --- a/src/openparse/processing/basic_transforms.py +++ b/src/openparse/processing/basic_transforms.py @@ -2,11 +2,22 @@ import io from abc import ABC, abstractmethod from collections import defaultdict -from typing import Dict, List, Literal +from typing import Dict, List, Literal, Type, TypeVar from PIL import Image -from openparse.schemas import Bbox, ImageElement, Node, TextElement +from openparse.schemas import Bbox, ImageElement, Node, TableElement, TextElement + +E = TypeVar("E", TextElement, ImageElement, TableElement) + + +def get_elements_of_type(nodes: List[Node], element_type: Type[E]) -> List[E]: + elements: List[E] = [] + for node in nodes: + for element in node.elements: + if isinstance(element, element_type): + elements.append(element) + return elements class ProcessingStep(ABC): @@ -34,7 +45,7 @@ def _combine_images_in_group( for node in image_elements: image_data = base64.b64decode(node.image) image = Image.open(io.BytesIO(image_data)) - image = image.rotate(180) + # image = image.rotate(180) images.append(image) # Determine the width and total height of the final image @@ -98,11 +109,11 @@ def process(self, nodes: List[Node]) -> List[Node]: for page, page_nodes in nodes_by_page.items(): image_nodes = [e for e in page_nodes if e.variant == {"image"}] if image_nodes: - image_elements: List[ImageElement] = [ - sub_e for e in image_nodes for sub_e in e.elements - ] # type: ignore + image_elements = get_elements_of_type(image_nodes, ImageElement) + text_elements = get_elements_of_type(page_nodes, TextElement) + combined_image = self._combine_images_in_group(image_elements) - new_nodes.append(Node(elements=(combined_image,))) + new_nodes.append(Node(elements=(combined_image, *text_elements))) else: new_nodes.extend(page_nodes) return new_nodes diff --git a/src/openparse/text/pdfminer/core.py b/src/openparse/text/pdfminer/core.py index b34adae..cace2d8 100644 --- a/src/openparse/text/pdfminer/core.py +++ b/src/openparse/text/pdfminer/core.py @@ -157,15 +157,17 @@ def _get_bbox(lines: List[LineElement]) -> Tuple[float, float, float, float]: return x0, y0, x1, y1 -def _process_png_image(e: LTImage) -> Optional[bytes]: +def _process_png_image(e: LTImage, page_rotation: int = 0) -> Optional[bytes]: try: # Extract image attributes width = e.stream.attrs.get("Width", 0) height = e.stream.attrs.get("Height", 0) color_space = e.stream.attrs.get("ColorSpace", "DeviceRGB") + decode = e.stream.attrs.get("Decode", None) - # Resolve indirect references in ColorSpace + # Resolve indirect references color_space = resolve1(color_space) + decode = resolve1(decode) # Ensure color_space is a string if isinstance(color_space, list): @@ -175,7 +177,7 @@ def _process_png_image(e: LTImage) -> Optional[bytes]: if isinstance(color_space, str): color_space = color_space.strip("/") else: - print(f"Unsupported color space type: {type(color_space)}") + logging.info(f"Unsupported color space type: {type(color_space)}") return None # Map PDF color space to PIL mode @@ -200,7 +202,7 @@ def _process_png_image(e: LTImage) -> Optional[bytes]: img.save(output, format="PNG") return output.getvalue() except Exception as ex: - print(f"Error processing PNG image: {ex}") + logging.error(f"Error processing PNG image: {ex}") return None