Merge pull request #6 from bastienlc/dev

Add support for PDFs with pages of different sizes
bastienlc · Apr 21, 2024 · af83a95 · af83a95
2 parents ed1fbda + 2ad92f4
commit af83a95
Show file tree

Hide file tree

Showing 8 changed files with 133 additions and 42 deletions.
diff --git a/README.md b/README.md
@@ -40,6 +40,13 @@ pip install pdf-watermark
 
 ### Usage
 
+**TLDR**
+```bash
+watermark grid input.pdf "watermark text" -s output.pdf # Grid pattern for a single file
+watermark insert input_folder "watermark_image.png" # Insert image for a whole directory, overwriting the input files
+```
+
+**Detailed usage**
 ```
 Usage: watermark [OPTIONS] COMMAND [ARGS]...
 
@@ -188,6 +195,8 @@ pip install -e .
 * 2.1.2
     * Fix missing Poppler dependency.
     * Add test and lint to CI.
+* 2.2.0
+    * Support PDFs with pages of different sizes.
 
 ## License
 

diff --git a/src/pdf_watermark/handler.py b/src/pdf_watermark/handler.py
@@ -11,7 +11,7 @@
     GridOptions,
     InsertOptions,
 )
-from pdf_watermark.utils import convert_content_to_images
+from pdf_watermark.utils import convert_content_to_images, sort_pages
 
 
 def add_watermark_to_pdf(
@@ -20,42 +20,54 @@ def add_watermark_to_pdf(
     drawing_options: DrawingOptions,
     specific_options: Union[GridOptions, InsertOptions],
 ):
+    pdf_writer = pypdf.PdfWriter()
     pdf_to_transform = pypdf.PdfReader(input)
-    pdf_box = pdf_to_transform.pages[0].mediabox
-    page_width = pdf_box.width
-    page_height = pdf_box.height
 
-    with NamedTemporaryFile(delete=False) as temporary_file:
-        # The watermark is stored in a temporary pdf file
-        draw_watermarks(
-            temporary_file.name,
-            page_width,
-            page_height,
-            drawing_options,
-            specific_options,
-        )
+    page_sizes = []
+    for page in pdf_to_transform.pages:
+        page_sizes.append((page.mediabox.width, page.mediabox.height))
 
-        if drawing_options.unselectable and not drawing_options.save_as_image:
-            convert_content_to_images(
-                temporary_file.name, page_width, page_height, drawing_options.dpi
+    order = []
+    # Only one watermark is computed per page size
+    for watermark_width, watermark_height in set(page_sizes):
+        with NamedTemporaryFile(delete=False) as temporary_file:
+            # The watermark is stored in a temporary pdf file
+            draw_watermarks(
+                temporary_file.name,
+                watermark_width,
+                watermark_height,
+                drawing_options,
+                specific_options,
             )
 
-        watermark_pdf = pypdf.PdfReader(temporary_file.name)
-        pdf_writer = pypdf.PdfWriter()
+            if drawing_options.unselectable and not drawing_options.save_as_image:
+                convert_content_to_images(
+                    temporary_file.name,
+                    drawing_options.dpi,
+                )
 
-        for page in pdf_to_transform.pages:
-            page.merge_page(watermark_pdf.pages[0])
-            pdf_writer.add_page(page)
+            watermark_pdf = pypdf.PdfReader(temporary_file.name)
 
-        # Remove temp file - https://stackoverflow.com/questions/23212435/permission-denied-to-write-to-my-temporary-file
-        temporary_file.close()
-        os.unlink(temporary_file.name)
+            # Add watermark to pages with the same size
+            for index, (page, (page_width, page_height)) in enumerate(
+                zip(pdf_to_transform.pages, page_sizes)
+            ):
+                if page_width == watermark_width and page_height == watermark_height:
+                    page.merge_page(watermark_pdf.pages[0])
+                    pdf_writer.add_page(page)
+                    order.append(index)
+
+            # Remove temp file - https://stackoverflow.com/questions/23212435/permission-denied-to-write-to-my-temporary-file
+            temporary_file.close()
+            os.unlink(temporary_file.name)
+
+    pdf_writer = sort_pages(pdf_writer, order)
 
     with open(output, "wb") as f:
         pdf_writer.write(f)
 
     if drawing_options.save_as_image:
-        convert_content_to_images(output, page_width, page_height, drawing_options.dpi)
+        convert_content_to_images(output, drawing_options.dpi)
 
 
 def add_watermark_from_options(

diff --git a/src/pdf_watermark/utils.py b/src/pdf_watermark/utils.py
@@ -1,7 +1,8 @@
 from io import BytesIO
-from typing import Tuple
+from typing import List, Tuple
 
 import numpy as np
+import pypdf
 from pdf2image import convert_from_path
 from pdf2image.exceptions import PopplerNotInstalledError
 from reportlab.lib.utils import ImageReader
@@ -51,9 +52,8 @@ def fit_image(image_width, image_height, max_image_width, max_image_height, scal
     return image_width, image_height
 
 
-def convert_content_to_images(
-    file_name: str, page_width: int, page_height: int, dpi: int
-):
+def convert_content_to_images(file_name: str, dpi: int):
+    # load pages as images
     try:
         images = convert_from_path(file_name, dpi=dpi, fmt="png", transparent=True)
     except PopplerNotInstalledError:
@@ -62,9 +62,17 @@ def convert_content_to_images(
         )
         return None
 
-    pdf = canvas.Canvas(file_name, pagesize=(page_width, page_height))
+    # get the page sizes
+    pdf_to_transform = pypdf.PdfReader(file_name)
+    page_sizes = []
+    for page in pdf_to_transform.pages:
+        page_sizes.append((page.mediabox.width, page.mediabox.height))
+
+    # create new pdf
+    pdf = canvas.Canvas(file_name)
 
-    for image in images:
+    for image, (page_width, page_height) in zip(images, page_sizes):
+        pdf.setPageSize((page_width, page_height))
         compressed = BytesIO()
         image.save(compressed, format="png", optimize=True, quality=dpi // 10)
 
@@ -79,3 +87,11 @@ def convert_content_to_images(
         pdf.showPage()
 
     pdf.save()
+
+
+def sort_pages(pdf: pypdf.PdfWriter, order: List[int]):
+    output = pypdf.PdfWriter()
+    for index in np.argsort(order):
+        output.add_page(pdf.pages[int(index)])
+
+    return output
diff --git a/tests/fixtures/different_sizes_input.pdf b/tests/fixtures/different_sizes_input.pdf
diff --git a/tests/fixtures/different_sizes_output.pdf b/tests/fixtures/different_sizes_output.pdf
diff --git a/tests/test_add_watermark_from_options.py b/tests/test_add_watermark_from_options.py
@@ -4,9 +4,7 @@
 
 import os
 
-import numpy as np
 import pytest
-from pdf2image import convert_from_path
 
 from pdf_watermark.handler import add_watermark_from_options
 from pdf_watermark.options import (
@@ -16,6 +14,7 @@
     InsertOptions,
 )
 from pdf_watermark.watermark import DEFAULTS
+from tests.utils import assert_pdfs_are_close
 
 INPUT = "tests/fixtures/input.pdf"
 OUTPUT = "output.pdf"
@@ -62,15 +61,6 @@ def cleanup():
 ]
 
 
-def assert_pdfs_are_close(path_1: str, path_2: str, epsilon: float = 1e-10):
-    """This function checks that two PDFs are close enough. We chose to convert the PDFs to images and then compare their L1 norms, because other techniques (hashing for instance) might break easily."""
-    images_1 = convert_from_path(path_1)
-    images_2 = convert_from_path(path_2)
-
-    for im1, im2 in zip(images_1, images_2):
-        assert np.sum(np.abs(np.array(im1) - np.array(im2))) < epsilon
-
-
 def test_add_watermark_from_options():
     index = 0
     for files_options in FILES_OPTIONS_FIXTURES:

diff --git a/tests/test_different_page_sizes.py b/tests/test_different_page_sizes.py
@@ -0,0 +1,53 @@
+"""
+Test for PDFs with different page sizes.
+"""
+
+import os
+
+import pytest
+
+from pdf_watermark.handler import add_watermark_from_options
+from pdf_watermark.options import DrawingOptions, FilesOptions, GridOptions
+from pdf_watermark.watermark import DEFAULTS
+from tests.utils import assert_pdfs_are_close
+
+INPUT = "tests/fixtures/different_sizes_input.pdf"
+OUTPUT = "output.pdf"
+FIXTURE = "tests/fixtures/different_sizes_output.pdf"
+
+
+@pytest.fixture(autouse=True)
+def cleanup():
+    yield
+    os.remove(OUTPUT)
+
+
+DRAWING_OPTIONS_FIXTURE = DrawingOptions(
+    watermark="watermark",
+    opacity=DEFAULTS.opacity,
+    angle=DEFAULTS.angle,
+    text_color=DEFAULTS.text_color,
+    text_font=DEFAULTS.text_font,
+    text_size=DEFAULTS.text_size,
+    unselectable=DEFAULTS.unselectable,
+    image_scale=DEFAULTS.image_scale,
+    save_as_image=DEFAULTS.save_as_image,
+    dpi=DEFAULTS.dpi,
+)
+
+FILES_OPTIONS_FIXTURE = FilesOptions(INPUT, OUTPUT)
+
+GRID_OPTIONS_FIXTURE = GridOptions(
+    horizontal_boxes=DEFAULTS.horizontal_boxes,
+    vertical_boxes=DEFAULTS.vertical_boxes,
+    margin=DEFAULTS.margin,
+)
+
+
+def test_different_page_sizes():
+    add_watermark_from_options(
+        files_options=FILES_OPTIONS_FIXTURE,
+        drawing_options=DRAWING_OPTIONS_FIXTURE,
+        specific_options=GRID_OPTIONS_FIXTURE,
+    )
+    assert_pdfs_are_close(OUTPUT, FIXTURE)
diff --git a/tests/utils.py b/tests/utils.py
@@ -0,0 +1,11 @@
+import numpy as np
+from pdf2image import convert_from_path
+
+
+def assert_pdfs_are_close(path_1: str, path_2: str, epsilon: float = 1e-10):
+    """This function checks that two PDFs are close enough. We chose to convert the PDFs to images and then compare their L1 norms, because other techniques (hashing for instance) might break easily."""
+    images_1 = convert_from_path(path_1)
+    images_2 = convert_from_path(path_2)
+
+    for im1, im2 in zip(images_1, images_2):
+        assert np.sum(np.abs(np.array(im1) - np.array(im2))) < epsilon