Skip to content

Commit

Permalink
Merge pull request #6 from bastienlc/dev
Browse files Browse the repository at this point in the history
Add support for PDFs with pages of different sizes
  • Loading branch information
bastienlc authored Apr 21, 2024
2 parents ed1fbda + 2ad92f4 commit af83a95
Show file tree
Hide file tree
Showing 8 changed files with 133 additions and 42 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ pip install pdf-watermark

### Usage

**TLDR**
```bash
watermark grid input.pdf "watermark text" -s output.pdf # Grid pattern for a single file
watermark insert input_folder "watermark_image.png" # Insert image for a whole directory, overwriting the input files
```

**Detailed usage**
```
Usage: watermark [OPTIONS] COMMAND [ARGS]...
Expand Down Expand Up @@ -188,6 +195,8 @@ pip install -e .
* 2.1.2
* Fix missing Poppler dependency.
* Add test and lint to CI.
* 2.2.0
* Support PDFs with pages of different sizes.

## License

Expand Down
62 changes: 37 additions & 25 deletions src/pdf_watermark/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
GridOptions,
InsertOptions,
)
from pdf_watermark.utils import convert_content_to_images
from pdf_watermark.utils import convert_content_to_images, sort_pages


def add_watermark_to_pdf(
Expand All @@ -20,42 +20,54 @@ def add_watermark_to_pdf(
drawing_options: DrawingOptions,
specific_options: Union[GridOptions, InsertOptions],
):
pdf_writer = pypdf.PdfWriter()
pdf_to_transform = pypdf.PdfReader(input)
pdf_box = pdf_to_transform.pages[0].mediabox
page_width = pdf_box.width
page_height = pdf_box.height

with NamedTemporaryFile(delete=False) as temporary_file:
# The watermark is stored in a temporary pdf file
draw_watermarks(
temporary_file.name,
page_width,
page_height,
drawing_options,
specific_options,
)
page_sizes = []
for page in pdf_to_transform.pages:
page_sizes.append((page.mediabox.width, page.mediabox.height))

if drawing_options.unselectable and not drawing_options.save_as_image:
convert_content_to_images(
temporary_file.name, page_width, page_height, drawing_options.dpi
order = []
# Only one watermark is computed per page size
for watermark_width, watermark_height in set(page_sizes):
with NamedTemporaryFile(delete=False) as temporary_file:
# The watermark is stored in a temporary pdf file
draw_watermarks(
temporary_file.name,
watermark_width,
watermark_height,
drawing_options,
specific_options,
)

watermark_pdf = pypdf.PdfReader(temporary_file.name)
pdf_writer = pypdf.PdfWriter()
if drawing_options.unselectable and not drawing_options.save_as_image:
convert_content_to_images(
temporary_file.name,
drawing_options.dpi,
)

for page in pdf_to_transform.pages:
page.merge_page(watermark_pdf.pages[0])
pdf_writer.add_page(page)
watermark_pdf = pypdf.PdfReader(temporary_file.name)

# Remove temp file - https://stackoverflow.com/questions/23212435/permission-denied-to-write-to-my-temporary-file
temporary_file.close()
os.unlink(temporary_file.name)
# Add watermark to pages with the same size
for index, (page, (page_width, page_height)) in enumerate(
zip(pdf_to_transform.pages, page_sizes)
):
if page_width == watermark_width and page_height == watermark_height:
page.merge_page(watermark_pdf.pages[0])
pdf_writer.add_page(page)
order.append(index)

# Remove temp file - https://stackoverflow.com/questions/23212435/permission-denied-to-write-to-my-temporary-file
temporary_file.close()
os.unlink(temporary_file.name)

pdf_writer = sort_pages(pdf_writer, order)

with open(output, "wb") as f:
pdf_writer.write(f)

if drawing_options.save_as_image:
convert_content_to_images(output, page_width, page_height, drawing_options.dpi)
convert_content_to_images(output, drawing_options.dpi)


def add_watermark_from_options(
Expand Down
28 changes: 22 additions & 6 deletions src/pdf_watermark/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from io import BytesIO
from typing import Tuple
from typing import List, Tuple

import numpy as np
import pypdf
from pdf2image import convert_from_path
from pdf2image.exceptions import PopplerNotInstalledError
from reportlab.lib.utils import ImageReader
Expand Down Expand Up @@ -51,9 +52,8 @@ def fit_image(image_width, image_height, max_image_width, max_image_height, scal
return image_width, image_height


def convert_content_to_images(
file_name: str, page_width: int, page_height: int, dpi: int
):
def convert_content_to_images(file_name: str, dpi: int):
# load pages as images
try:
images = convert_from_path(file_name, dpi=dpi, fmt="png", transparent=True)
except PopplerNotInstalledError:
Expand All @@ -62,9 +62,17 @@ def convert_content_to_images(
)
return None

pdf = canvas.Canvas(file_name, pagesize=(page_width, page_height))
# get the page sizes
pdf_to_transform = pypdf.PdfReader(file_name)
page_sizes = []
for page in pdf_to_transform.pages:
page_sizes.append((page.mediabox.width, page.mediabox.height))

# create new pdf
pdf = canvas.Canvas(file_name)

for image in images:
for image, (page_width, page_height) in zip(images, page_sizes):
pdf.setPageSize((page_width, page_height))
compressed = BytesIO()
image.save(compressed, format="png", optimize=True, quality=dpi // 10)

Expand All @@ -79,3 +87,11 @@ def convert_content_to_images(
pdf.showPage()

pdf.save()


def sort_pages(pdf: pypdf.PdfWriter, order: List[int]):
    """Return a new writer containing the pages of ``pdf`` rearranged by ``order``.

    ``order[i]`` is the sort key of the i-th page currently held by ``pdf``.
    Pages are copied into a fresh ``pypdf.PdfWriter`` by ascending key, so
    the page with the smallest key comes first. ``pdf`` itself is not
    reordered.
    """
    reordered = pypdf.PdfWriter()
    # argsort yields, for each output slot, the position of the page in
    # ``pdf`` that belongs there; tolist() turns them into plain ints.
    positions = np.argsort(order).tolist()
    for position in positions:
        reordered.add_page(pdf.pages[position])

    return reordered
Binary file added tests/fixtures/different_sizes_input.pdf
Binary file not shown.
Binary file added tests/fixtures/different_sizes_output.pdf
Binary file not shown.
12 changes: 1 addition & 11 deletions tests/test_add_watermark_from_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@

import os

import numpy as np
import pytest
from pdf2image import convert_from_path

from pdf_watermark.handler import add_watermark_from_options
from pdf_watermark.options import (
Expand All @@ -16,6 +14,7 @@
InsertOptions,
)
from pdf_watermark.watermark import DEFAULTS
from tests.utils import assert_pdfs_are_close

INPUT = "tests/fixtures/input.pdf"
OUTPUT = "output.pdf"
Expand Down Expand Up @@ -62,15 +61,6 @@ def cleanup():
]


def assert_pdfs_are_close(path_1: str, path_2: str, epsilon: float = 1e-10):
"""This function checks that two PDFs are close enough. We chose to convert the PDFs to images and then compare their L1 norms, because other techniques (hashing for instance) might break easily."""
images_1 = convert_from_path(path_1)
images_2 = convert_from_path(path_2)

for im1, im2 in zip(images_1, images_2):
assert np.sum(np.abs(np.array(im1) - np.array(im2))) < epsilon


def test_add_watermark_from_options():
index = 0
for files_options in FILES_OPTIONS_FIXTURES:
Expand Down
53 changes: 53 additions & 0 deletions tests/test_different_page_sizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""
Test for PDFs with different page sizes.
"""

import os

import pytest

from pdf_watermark.handler import add_watermark_from_options
from pdf_watermark.options import DrawingOptions, FilesOptions, GridOptions
from pdf_watermark.watermark import DEFAULTS
from tests.utils import assert_pdfs_are_close

INPUT = "tests/fixtures/different_sizes_input.pdf"
OUTPUT = "output.pdf"
FIXTURE = "tests/fixtures/different_sizes_output.pdf"


@pytest.fixture(autouse=True)
def cleanup():
    """Delete the generated OUTPUT file once each test has finished.

    Runs automatically around every test in this module (``autouse=True``):
    the test body executes at the ``yield``, then the output file is removed.
    """
    yield
    # The test may fail before OUTPUT is ever written. Tolerate a missing
    # file so teardown does not add a FileNotFoundError on top of the real
    # failure.
    try:
        os.remove(OUTPUT)
    except FileNotFoundError:
        pass


# Drawing options for the test: a literal "watermark" text, with every other
# setting taken from the package DEFAULTS.
DRAWING_OPTIONS_FIXTURE = DrawingOptions(
    watermark="watermark",
    opacity=DEFAULTS.opacity,
    angle=DEFAULTS.angle,
    text_color=DEFAULTS.text_color,
    text_font=DEFAULTS.text_font,
    text_size=DEFAULTS.text_size,
    unselectable=DEFAULTS.unselectable,
    image_scale=DEFAULTS.image_scale,
    save_as_image=DEFAULTS.save_as_image,
    dpi=DEFAULTS.dpi,
)

# File options built from the INPUT and OUTPUT paths defined above.
FILES_OPTIONS_FIXTURE = FilesOptions(INPUT, OUTPUT)

# Grid options using the default box counts and margin.
GRID_OPTIONS_FIXTURE = GridOptions(
    horizontal_boxes=DEFAULTS.horizontal_boxes,
    vertical_boxes=DEFAULTS.vertical_boxes,
    margin=DEFAULTS.margin,
)


def test_different_page_sizes():
    """Watermark the different-sizes input PDF and compare it with the stored fixture."""
    options = {
        "files_options": FILES_OPTIONS_FIXTURE,
        "drawing_options": DRAWING_OPTIONS_FIXTURE,
        "specific_options": GRID_OPTIONS_FIXTURE,
    }
    add_watermark_from_options(**options)

    # OUTPUT is what the call above wrote; FIXTURE is the stored reference.
    assert_pdfs_are_close(OUTPUT, FIXTURE)
11 changes: 11 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import numpy as np
from pdf2image import convert_from_path


def assert_pdfs_are_close(path_1: str, path_2: str, epsilon: float = 1e-10):
    """Assert that two PDFs render to (almost) identical images.

    Both files are converted to images with ``convert_from_path`` and compared
    page by page using the L1 norm of the pixel difference. Rendering is used
    because other techniques (hashing for instance) might break easily.

    Args:
        path_1: Path of the first PDF.
        path_2: Path of the second PDF.
        epsilon: Strict upper bound on the per-page L1 norm of the difference.

    Raises:
        AssertionError: If the PDFs have a different number of pages, if a
            pair of pages renders to different image shapes, or if a pair of
            pages differs by ``epsilon`` or more.
    """
    images_1 = convert_from_path(path_1)
    images_2 = convert_from_path(path_2)

    # zip() stops at the shorter sequence, so missing or extra pages would
    # otherwise go unnoticed.
    assert len(images_1) == len(images_2), (
        f"Page count differs: {len(images_1)} != {len(images_2)}"
    )

    for page_number, (im1, im2) in enumerate(zip(images_1, images_2), start=1):
        # Widen to a signed integer type before subtracting: if the rendered
        # pages come back as unsigned 8-bit arrays, a native subtraction would
        # wrap around and distort the norm.
        pixels_1 = np.array(im1).astype(np.int64)
        pixels_2 = np.array(im2).astype(np.int64)

        # Fail clearly instead of broadcasting or raising an unrelated
        # ValueError when the pages were rendered at different sizes.
        assert pixels_1.shape == pixels_2.shape, (
            f"Page {page_number} size differs: "
            f"{pixels_1.shape} != {pixels_2.shape}"
        )

        assert np.sum(np.abs(pixels_1 - pixels_2)) < epsilon, (
            f"Page {page_number} differs by more than {epsilon}"
        )

0 comments on commit af83a95

Please sign in to comment.