Skip to content

Commit

Permalink
Add renderpdf.c to library
Browse files Browse the repository at this point in the history
* This implements a function, used in several programs, that takes a
  set of input pdf files that are assumed to be page images,
  and renders every page into a temp directory at a requested resolution.
* Modified prog/cleanpdf to use this function; verified it against the
  previous implementation and checked it with valgrind.
* The new implementation is better because it writes the output files
  in a temp directory that is cleaned out with each invocation of
  the function.  In the previous implementation, it was necessary to
  remove previously rendered files by hand.
* The new implementation allows output images to be rendered at
  resolutions between 50 and 300 ppi, independent of the actual
  resolution of the input images wrapped in the pdf files.  This is
  done by assuming the input pages are 612 x 792 printer points
  (i.e., 8.5 x 11 inches at 72 points per inch).
* pdf files generated by applications that use this function, such as
  cleanpdf, will print normally on 8.5 x 11 inch paper.
  • Loading branch information
DanBloomberg committed Jul 8, 2024
1 parent 66237e2 commit 913550c
Show file tree
Hide file tree
Showing 6 changed files with 304 additions and 101 deletions.
114 changes: 16 additions & 98 deletions prog/cleanpdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,16 +88,16 @@
*
* Whenever possible, the images will be deskewed.
*
* As the first step in processing, images are saved in the ./image
* directory as RGB at 300 ppi in ppm format. Each image is about 26MB.
* Delete those images after use.
* As the first step in processing, images are saved in the directory
* /tmp/lept/renderpdf/, as RGB at 300 ppi in ppm format. Each image
* is about 26MB.
*
* Some pdf files have oversize media boxes. PDF is a
* resolution-independent format for storing data that can be imaged.
* Usually the data is stored in fonts, which are a description of the
* shape that can be rendered at different image resolutions. We deal
* here with images that are made up of a fixed number of pixels, and
* thus are not resolution independent. It is necessary for image
* shape that can be rendered at different image resolutions. We often
* deal here with images that are made up of a fixed number of pixels,
* and thus are not resolution independent. It is necessary for image
* specification to include data for the renderer that says how big
* (in inches) to display or print the image. That is done with /MediaBox,
* whose 3rd and 4th parameters are the width and height of the output
Expand All @@ -119,8 +119,8 @@
* * To get an output filename with spaces, use single quotes; e.g.,
* cleanpdf dir [...] title 'quoted filename with spaces'
*
* N.B. This requires the Poppler package of pdf utilities, such as
* pdfimages and pdftoppm. For non-unix systems, this requires
* N.B. This requires running pdftoppm from the Poppler package
* of pdf utilities. For non-unix systems, this requires
* installation of the cygwin Poppler package:
* https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/
* poppler-0.26.5-1
Expand All @@ -130,30 +130,14 @@
#include <config_auto.h>
#endif /* HAVE_CONFIG_H */

#ifdef _WIN32
# if defined(_MSC_VER) || defined(__MINGW32__)
# include <direct.h>
# else
# include <io.h>
# endif /* _MSC_VER || __MINGW32__ */
#endif /* _WIN32 */

/* Set to 1 to use pdftoppm (recommended); 0 for pdfimages */
#define USE_PDFTOPPM 1

#include "string.h"
#include <sys/stat.h>
#include <sys/types.h>
#include "allheaders.h"

l_int32 main(int argc,
char **argv)
{
char buf[256];
char *basedir, *fname, *tail, *basename, *imagedir, *firstfile, *title;
char *fileout;
l_int32 i, n, res, contrast, rotation, opensize, render_res, ret;
SARRAY *sa;
char *basedir, *title, *fileout;
l_int32 res, contrast, rotation, opensize, render_res;
SARRAY *safiles;

if (argc != 8)
return ERROR_INT(
Expand Down Expand Up @@ -192,81 +176,15 @@ SARRAY *sa;
}
setLeptDebugOK(1);

/* Set up a directory for temp images */
if ((imagedir = stringJoin(basedir, "/image")) == NULL)
return ERROR_INT_1("imagedir from basedir not found", basedir,
/* Render all images from pdfs */
if (l_pdfRenderFiles(basedir, NULL, 300, &safiles))
return ERROR_INT_1("rendering failed from basedir", basedir,
__func__, 1);
#ifndef _WIN32
mkdir(imagedir, 0777);
#else
_mkdir(imagedir);
#endif /* _WIN32 */

/* Get the names of the input pdf files */
if ((sa = getSortedPathnamesInDirectory(basedir, "pdf", 0, 0)) == NULL)
return ERROR_INT("files not found", __func__, 1);
sarrayWriteStderr(sa);
n = sarrayGetCount(sa);

/* Figure out the resolution to use with the image renderer.
This first checks the media box sizes, which give the output
image size in printer points (1/72 inch). The largest expected
output image has a max dimension of about 11 inches, corresponding
to 792 points. At a resolution of 300 ppi, the max image size
is then 3300. For robustness, use the median of media box sizes.
If the max dimension of this median is significantly larger than
792, reduce the input resolution to the renderer. Specifically:
* Calculate the median of the MediaBox widths and heights.
* If the max exceeds 850, reduce the resolution so that the max
dimension of the rendered image is 3300. The new resolution
input to the renderer is reduced from 300 by the factor:
(792 / medmax)
If the media boxes are not found, render a page using a small
given resolution (72) and use the max dimension to find the
resolution that will produce a 3300 pixel size output. */
firstfile = sarrayGetString(sa, 0, L_NOCOPY);
getPdfRendererResolution(firstfile, imagedir, &render_res);

/* Rasterize: use either
* pdftoppm -r res fname outroot (-r res renders output at res ppi)
* or
* pdfimages -j fname outroot (-j outputs jpeg if input is dct)
* Use of pdftoppm:
* This works on all pdf pages, both wrapped images and pages that
* were made orthographically. The default output resolution for
* pdftoppm is 150 ppi, but we use 300 ppi. This makes large
* uncompressed files (e.g., a standard size RGB page image at 300
* ppi is 25 MB), but it is very fast. This is now preferred over
* using pdfimages.
* Use of pdfimages:
* This only works when all pages are pdf wrappers around images.
* In some cases, it scrambles the order of the output pages
* and inserts extra images. */
for (i = 0; i < n; i++) {
fname = sarrayGetString(sa, i, L_NOCOPY);
splitPathAtDirectory(fname, NULL, &tail);
splitPathAtExtension(tail, &basename, NULL);
#if USE_PDFTOPPM
snprintf(buf, sizeof(buf), "pdftoppm -r %d %s %s/%s",
render_res, fname, imagedir, basename);
#else
snprintf(buf, sizeof(buf), "pdfimages -j %s %s/%s",
fname, imagedir, basename);
#endif /* USE_PDFTOPPM */
lept_free(tail);
lept_free(basename);
lept_stderr("%s\n", buf);
callSystemDebug(buf); /* pdfimages or pdftoppm */
}
sarrayDestroy(&sa);

/* Clean, deskew and compress */
sa = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0);
lept_free(imagedir);
sarrayWriteStderr(sa);
lept_stderr("cleaning ...\n");
cleanTo1bppFilesToPdf(sa, res, contrast, rotation, opensize,
cleanTo1bppFilesToPdf(safiles, res, contrast, rotation, opensize,
title, fileout);
sarrayDestroy(&sa);
sarrayDestroy(&safiles);
return 0;
}
2 changes: 1 addition & 1 deletion src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ libleptonica_la_SOURCES = adaptmap.c affine.c \
quadtree.c queue.c rank.c rbtree.c \
readbarcode.c readfile.c \
recogbasic.c recogdid.c recogident.c \
recogtrain.c regutils.c \
recogtrain.c regutils.c renderpdf.c \
rop.c roplow.c \
rotate.c rotateam.c rotateorth.c rotateshear.c \
runlength.c sarray1.c sarray2.c \
Expand Down
2 changes: 2 additions & 0 deletions src/allheaders.h
Original file line number Diff line number Diff line change
Expand Up @@ -2320,6 +2320,8 @@ LEPT_DLL extern l_ok regTestCompareFiles ( L_REGPARAMS *rp, l_int32 index1, l_in
LEPT_DLL extern l_ok regTestWritePixAndCheck ( L_REGPARAMS *rp, PIX *pix, l_int32 format );
LEPT_DLL extern l_ok regTestWriteDataAndCheck ( L_REGPARAMS *rp, void *data, size_t nbytes, const char *ext );
LEPT_DLL extern char * regTestGenLocalFilename ( L_REGPARAMS *rp, l_int32 index, l_int32 format );
LEPT_DLL extern l_ok l_pdfRenderFile ( const char *filename, l_int32 res, SARRAY **psaout );
LEPT_DLL extern l_ok l_pdfRenderFiles ( const char *dir, SARRAY *sain, l_int32 res, SARRAY **psaout );
LEPT_DLL extern l_ok pixRasterop ( PIX *pixd, l_int32 dx, l_int32 dy, l_int32 dw, l_int32 dh, l_int32 op, PIX *pixs, l_int32 sx, l_int32 sy );
LEPT_DLL extern l_ok pixRasteropVip ( PIX *pixd, l_int32 bx, l_int32 bw, l_int32 vshift, l_int32 incolor );
LEPT_DLL extern l_ok pixRasteropHip ( PIX *pixd, l_int32 by, l_int32 bh, l_int32 hshift, l_int32 incolor );
Expand Down
2 changes: 1 addition & 1 deletion src/makefile.static
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ LEPTLIB_C = adaptmap.c affine.c \
ptra.c quadtree.c queue.c rank.c rbtree.c \
readbarcode.c readfile.c \
recogbasic.c recogdid.c recogident.c recogtrain.c \
regutils.c rop.c roplow.c \
regutils.c renderpdf.c rop.c roplow.c \
rotate.c rotateam.c rotateorth.c rotateshear.c \
runlength.c sarray1.c sarray2.c \
scale1.c scale2.c seedfill.c \
Expand Down
Loading

0 comments on commit 913550c

Please sign in to comment.