From 913550c70783f6fb16dba860ff5d33cbc5e85b63 Mon Sep 17 00:00:00 2001 From: danblooomberg Date: Mon, 8 Jul 2024 12:44:47 -0600 Subject: [PATCH] Add renderpdf.c to library * This implements a function used in several programs, that takes a set of input pdf files that are assumed to be page images, and renders every page in a temp directory at a requested resolution. * Modified prog/cleanpdf to use this function; checked with previous implementation; used valgrind. * The new implementation is better because it writes the output files in a temp directory that is cleaned out with each invocation of the function. In the previous implementation, it was necessary to remove previously rendered files by hand. * The new implementation allows output images to be rendered at resolutions between 50 and 300 ppi, independent of the actual resolution of the input images wrapped in the pdf files. This is done by assuming the input pages are 612 x 792 printer points. * pdf files generated by applications like cleanpdf, that use this function, will print normally on 8.5 x 11 inch paper. --- prog/cleanpdf.c | 114 +++--------------- src/Makefile.am | 2 +- src/allheaders.h | 2 + src/makefile.static | 2 +- src/renderpdf.c | 279 ++++++++++++++++++++++++++++++++++++++++++++ version-notes.html | 6 +- 6 files changed, 304 insertions(+), 101 deletions(-) create mode 100644 src/renderpdf.c diff --git a/prog/cleanpdf.c b/prog/cleanpdf.c index 939b3917f..5670552d6 100644 --- a/prog/cleanpdf.c +++ b/prog/cleanpdf.c @@ -88,16 +88,16 @@ * * Whenever possible, the images will be deskewed. * - * As the first step in processing, images are saved in the ./image - * directory as RGB at 300 ppi in ppm format. Each image is about 26MB. - * Delete those images after use. + * As the first step in processing, images are saved in the directory + * /tmp/lept/renderpdf/, as RGB at 300 ppi in ppm format. Each image + * is about 26MB. * * Some pdf files have oversize media boxes. 
PDF is a * resolution-independent format for storing data that can be imaged. * Usually the data is stored in fonts, which are a description of the - * shape that can be rendered at different image resolutions. We deal - * here with images that are made up of a fixed number of pixels, and - * thus are not resolution independent. It is necessary for image + * shape that can be rendered at different image resolutions. We often + * deal here with images that are made up of a fixed number of pixels, + * and thus are not resolution independent. It is necessary for image * specification to include data for the renderer that says how big * (in inches) to display or print the image. That is done with /MediaBox, * whose 3rd and 4th parameters are the width and height of the output @@ -119,8 +119,8 @@ * * To get an output filename with spaces, use single quotes; e.g., * cleanpdf dir [...] title 'quoted filename with spaces' * - * N.B. This requires the Poppler package of pdf utilities, such as - * pdfimages and pdftoppm. For non-unix systems, this requires + * N.B. This requires running pdfimages from the Poppler package + * of pdf utilities. 
For non-unix systems, this requires * installation of the cygwin Poppler package: * https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/ * poppler-0.26.5-1 @@ -130,30 +130,14 @@ #include #endif /* HAVE_CONFIG_H */ -#ifdef _WIN32 -# if defined(_MSC_VER) || defined(__MINGW32__) -# include -# else -# include -# endif /* _MSC_VER || __MINGW32__ */ -#endif /* _WIN32 */ - - /* Set to 1 to use pdftoppm (recommended); 0 for pdfimages */ -#define USE_PDFTOPPM 1 - -#include "string.h" -#include -#include #include "allheaders.h" l_int32 main(int argc, char **argv) { -char buf[256]; -char *basedir, *fname, *tail, *basename, *imagedir, *firstfile, *title; -char *fileout; -l_int32 i, n, res, contrast, rotation, opensize, render_res, ret; -SARRAY *sa; +char *basedir, *title, *fileout; +l_int32 res, contrast, rotation, opensize, render_res; +SARRAY *safiles; if (argc != 8) return ERROR_INT( @@ -192,81 +176,15 @@ SARRAY *sa; } setLeptDebugOK(1); - /* Set up a directory for temp images */ - if ((imagedir = stringJoin(basedir, "/image")) == NULL) - return ERROR_INT_1("imagedir from basedir not found", basedir, + /* Render all images from pdfs */ + if (l_pdfRenderFiles(basedir, NULL, 300, &safiles)) + return ERROR_INT_1("rendering failed from basedir", basedir, __func__, 1); - #ifndef _WIN32 - mkdir(imagedir, 0777); - #else - _mkdir(imagedir); - #endif /* _WIN32 */ - - /* Get the names of the input pdf files */ - if ((sa = getSortedPathnamesInDirectory(basedir, "pdf", 0, 0)) == NULL) - return ERROR_INT("files not found", __func__, 1); - sarrayWriteStderr(sa); - n = sarrayGetCount(sa); - - /* Figure out the resolution to use with the image renderer. - This first checks the media box sizes, which give the output - image size in printer points (1/72 inch). The largest expected - output image has a max dimension of about 11 inches, corresponding - to 792 points. At a resolution of 300 ppi, the max image size - is then 3300. For robustness, use the median of media box sizes. 
- If the max dimension of this median is significantly larger than - 792, reduce the input resolution to the renderer. Specifically: - * Calculate the median of the MediaBox widths and heights. - * If the max exceeds 850, reduce the resolution so that the max - dimension of the rendered image is 3300. The new resolution - input to the renderer is reduced from 300 by the factor: - (792 / medmax) - If the media boxes are not found, render a page using a small - given resolution (72) and use the max dimension to find the - resolution that will produce a 3300 pixel size output. */ - firstfile = sarrayGetString(sa, 0, L_NOCOPY); - getPdfRendererResolution(firstfile, imagedir, &render_res); - - /* Rasterize: use either - * pdftoppm -r res fname outroot (-r res renders output at res ppi) - * or - * pdfimages -j fname outroot (-j outputs jpeg if input is dct) - * Use of pdftoppm: - * This works on all pdf pages, both wrapped images and pages that - * were made orthographically. The default output resolution for - * pdftoppm is 150 ppi, but we use 300 ppi. This makes large - * uncompressed files (e.g., a standard size RGB page image at 300 - * ppi is 25 MB), but it is very fast. This is now preferred over - * using pdfimages. - * Use of pdfimages: - * This only works when all pages are pdf wrappers around images. - * In some cases, it scrambles the order of the output pages - * and inserts extra images. 
*/ - for (i = 0; i < n; i++) { - fname = sarrayGetString(sa, i, L_NOCOPY); - splitPathAtDirectory(fname, NULL, &tail); - splitPathAtExtension(tail, &basename, NULL); - #if USE_PDFTOPPM - snprintf(buf, sizeof(buf), "pdftoppm -r %d %s %s/%s", - render_res, fname, imagedir, basename); - #else - snprintf(buf, sizeof(buf), "pdfimages -j %s %s/%s", - fname, imagedir, basename); - #endif /* USE_PDFTOPPM */ - lept_free(tail); - lept_free(basename); - lept_stderr("%s\n", buf); - callSystemDebug(buf); /* pdfimages or pdftoppm */ - } - sarrayDestroy(&sa); /* Clean, deskew and compress */ - sa = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0); - lept_free(imagedir); - sarrayWriteStderr(sa); lept_stderr("cleaning ...\n"); - cleanTo1bppFilesToPdf(sa, res, contrast, rotation, opensize, + cleanTo1bppFilesToPdf(safiles, res, contrast, rotation, opensize, title, fileout); - sarrayDestroy(&sa); + sarrayDestroy(&safiles); return 0; } diff --git a/src/Makefile.am b/src/Makefile.am index 8997c6349..d6e9e4770 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -51,7 +51,7 @@ libleptonica_la_SOURCES = adaptmap.c affine.c \ quadtree.c queue.c rank.c rbtree.c \ readbarcode.c readfile.c \ recogbasic.c recogdid.c recogident.c \ - recogtrain.c regutils.c \ + recogtrain.c regutils.c renderpdf.c \ rop.c roplow.c \ rotate.c rotateam.c rotateorth.c rotateshear.c \ runlength.c sarray1.c sarray2.c \ diff --git a/src/allheaders.h b/src/allheaders.h index 01ba4eb49..c16ee3ce6 100644 --- a/src/allheaders.h +++ b/src/allheaders.h @@ -2320,6 +2320,8 @@ LEPT_DLL extern l_ok regTestCompareFiles ( L_REGPARAMS *rp, l_int32 index1, l_in LEPT_DLL extern l_ok regTestWritePixAndCheck ( L_REGPARAMS *rp, PIX *pix, l_int32 format ); LEPT_DLL extern l_ok regTestWriteDataAndCheck ( L_REGPARAMS *rp, void *data, size_t nbytes, const char *ext ); LEPT_DLL extern char * regTestGenLocalFilename ( L_REGPARAMS *rp, l_int32 index, l_int32 format ); +LEPT_DLL extern l_ok l_pdfRenderFile ( const char *filename, l_int32 
res, SARRAY **psaout ); +LEPT_DLL extern l_ok l_pdfRenderFiles ( const char *dir, SARRAY *sain, l_int32 res, SARRAY **psaout ); LEPT_DLL extern l_ok pixRasterop ( PIX *pixd, l_int32 dx, l_int32 dy, l_int32 dw, l_int32 dh, l_int32 op, PIX *pixs, l_int32 sx, l_int32 sy ); LEPT_DLL extern l_ok pixRasteropVip ( PIX *pixd, l_int32 bx, l_int32 bw, l_int32 vshift, l_int32 incolor ); LEPT_DLL extern l_ok pixRasteropHip ( PIX *pixd, l_int32 by, l_int32 bh, l_int32 hshift, l_int32 incolor ); diff --git a/src/makefile.static b/src/makefile.static index 71c0940a1..8b3af559e 100644 --- a/src/makefile.static +++ b/src/makefile.static @@ -217,7 +217,7 @@ LEPTLIB_C = adaptmap.c affine.c \ ptra.c quadtree.c queue.c rank.c rbtree.c \ readbarcode.c readfile.c \ recogbasic.c recogdid.c recogident.c recogtrain.c \ - regutils.c rop.c roplow.c \ + regutils.c renderpdf.c rop.c roplow.c \ rotate.c rotateam.c rotateorth.c rotateshear.c \ runlength.c sarray1.c sarray2.c \ scale1.c scale2.c seedfill.c \ diff --git a/src/renderpdf.c b/src/renderpdf.c new file mode 100644 index 000000000..3fb085aca --- /dev/null +++ b/src/renderpdf.c @@ -0,0 +1,279 @@ +/*====================================================================* + - Copyright (C) 2001 Leptonica. All rights reserved. + - + - Redistribution and use in source and binary forms, with or without + - modification, are permitted provided that the following conditions + - are met: + - 1. Redistributions of source code must retain the above copyright + - notice, this list of conditions and the following disclaimer. + - 2. Redistributions in binary form must reproduce the above + - copyright notice, this list of conditions and the following + - disclaimer in the documentation and/or other materials + - provided with the distribution. 
+ - + - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY + - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *====================================================================*/ + +/*! + * \file renderpdf.c + *
+ *
+ *   Rendering pdf files using an external library
+ *        l_ok        l_pdfRenderFile()
+ *        l_ok        l_pdfRenderFiles()
+ *
+ *   Utility for rendering a set of pdf files as page images.
+ *   Each page is rendered as a full page image at a specified
+ *   resolution between 50 and 300 ppi, in the directory
+ *       /tmp/lept/renderpdf/
+ *
+ *   An application like cleanpdf performs a sequence of:
+ *   (1) rendering the pdfs into a set of images,
+ *   (2) doing image processing on each image to generate new images, and
+ *   (3) wrapping the new images up in a single pdf file.
+ *   Typically, the processed images made by step (2) are stored compressed
+ *   in memory in a PixaComp, before wrapping them up in step (3).
+ *
+ *   This requires the Poppler package of pdf utilities, in particular
+ *   the program pdftoppm.  For non-unix systems, this requires
+ *   installation of the cygwin Poppler package:
+ *      https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/
+ *            poppler-0.26.5-1
+ *
+ *   For the rasterizer, use pdftoppm:
+ *      pdftoppm -r res fname outroot  ('-r res' renders output at res ppi)
+ *   This works on all pdf pages, both wrapped images and pages that
+ *   were made orthographically.  The default output resolution for
+ *   pdftoppm is 150 ppi, but we typically use 300 ppi.  This makes large
+ *   uncompressed RGB image files (e.g., a standard size RGB page image
+ *   at 300 ppi is 25 MB), but it is very fast.
+ *
+ *   The size of the resulting images does not depend on the resolution
+ *   of the images stored in the input pdf.  We compute the value of the
+ *   resolution parameter (render_res) that when input to pdftoppm
+ *   will generate a page-size image (612 x 792 pts) at the requested
+ *   output resolution.
+ *
+ *   We do NOT use pdfimages:
+ *      pdfimages -j fname outroot   (-j outputs jpeg if input is dct)
+ *   pdfimages only works when all pages are pdf wrappers around images.
+ *   Further, in some cases, it scrambles the order of the output pages
+ *   and inserts extra images.
+ *
+ *   By default, this function will not run, because it makes a call
+ *   to system(3).  To render pdfs as a set of images in a directory,
+ *   three things are required:
+ *   (1) To have poppler installed.
+ *   (2) To enable debug operations using setLeptDebugOK(1).
+ *   (3) To link the functions that generate pdf files in the library
+ *       (in pdfio1.c, pdfio2.c).
+ * 
+ */ + +#ifdef HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ + +#include "allheaders.h" + +/* --------------------------------------------*/ +#if USE_PDFIO /* defined in environ.h */ +/* --------------------------------------------*/ + +/*-----------------------------------------------------------------* + * Rendering pdf files using an external library * + *-----------------------------------------------------------------*/ +/*! + * \brief l_pdfRenderFile() + * + * \param[in] filename input pdf file + * \param[in] res output resolution (0, [50 ... 300]) ppi + * \param[out] psaout sarray of filenames of rasterized images + * \return 0 if OK, 1 on error + * + *
+ * Notes:
+ *      (1) Wrapper to l_pdfRenderFiles() for a single input pdf file.
+ * 
+ */
+l_ok
+l_pdfRenderFile(const char  *filename,
+                l_int32      res,
+                SARRAY     **psaout)
+{
+l_int32  ret;
+SARRAY  *sain;
+
+        /* Clear the output first so the caller never sees a stale
+         * pointer on any of the error returns below. */
+    if (!psaout)
+        return ERROR_INT("&saout not defined", __func__, 1);
+    *psaout = NULL;
+    if (!filename)
+        return ERROR_INT("filename not defined", __func__, 1);
+
+        /* Wrap the single pathname in a one-element sarray.  Fail here,
+         * with a message that names the real problem, rather than letting
+         * l_pdfRenderFiles() complain about missing inputs. */
+    if ((sain = sarrayCreate(1)) == NULL)
+        return ERROR_INT("sain not made", __func__, 1);
+    if (sarrayAddString(sain, filename, L_COPY)) {
+        sarrayDestroy(&sain);
+        return ERROR_INT("filename not added to sain", __func__, 1);
+    }
+
+        /* The multi-file renderer does all the work; dir is unused
+         * because the sarray is given. */
+    ret = l_pdfRenderFiles(NULL, sain, res, psaout);
+    sarrayDestroy(&sain);
+    return ret;
+}
+
+
+/*!
+ * \brief   l_pdfRenderFiles()
+ *
+ * \param[in]    dir       directory of input pdf files
+ * \param[in]    sain      sarray of input pdf filenames
+ * \param[in]    res       output resolution (0, [50 ... 300]) ppi
+ * \param[out]   psaout    sarray of output filenames of rendered images
+ * \return  0 if OK, 1 on error
+ *
+ *
+ * Notes:
+ *      (1) Because this uses the "system" call, it is disabled by default
+ *          on all platforms.  It is not supported and therefore disabled
+ *          on iOS 11.
+ *      (2) Input pdf file(s) are specified either by an input directory
+ *          or an sarray with the paths.  Use the sarray if it is given;
+ *          otherwise, use all files in the directory with extension "pdf",
+ *          and name the rendered images in lexical order of the filenames.
+ *      (3) The allowed output rendering resolutions are between 50 ppi
+ *          and 300 ppi.  Typical resolutions are 150 and 300 ppi.
+ *          Default input value of 0 can be used for 300 ppi resolution.
+ *      (4) Images are rendered in ppm format in directory /tmp/lept/renderpdf
+ *          and named in lexical order of the input filenames.  On invocation,
+ *          any existing files in this directory are removed.
+ *      (5) This requires pdftoppm from the Poppler package of pdf utilities.
+ * 
+ */
+l_ok
+l_pdfRenderFiles(const char  *dir,
+                 SARRAY      *sain,
+                 l_int32      res,
+                 SARRAY     **psaout)
+{
+char      buf[2048];
+char     *imagedir, *firstfile, *fname, *basename, *tail;
+l_int32   i, nfiles, nchars, render_res;
+SARRAY   *sa;
+
+        /* Clear the output before anything else.  The "disabled" return
+         * just below reports success, and a caller that only checks the
+         * return value must not be left holding an uninitialized pointer. */
+    if (psaout) *psaout = NULL;
+
+    if (!LeptDebugOK) {
+        L_INFO("running pdftoppm is disabled; "
+               "use setLeptDebugOK(1) to enable\n", __func__);
+        return 0;
+    }
+
+  #ifdef OS_IOS  /* iOS 11 does not support system() */
+        /* This is a failure: nothing can be rendered, so return 1. */
+    return ERROR_INT("iOS 11 does not support system()", __func__, 1);
+  #endif  /* OS_IOS */
+
+    if (!psaout)
+        return ERROR_INT("&saout not defined", __func__, 1);
+    if (res == 0) res = 300;
+    if (res < 50 || res > 300)
+        return ERROR_INT("res not in range [50 ... 300]", __func__, 1);
+    if (!dir && !sain)
+        return ERROR_INT("neither dir or sain are defined", __func__, 1);
+
+        /* Work on a private list of input paths: a copy of %sain if it
+         * is given, otherwise the sorted "pdf" files found in %dir. */
+    if (sain) {
+        if ((sa = sarrayCopy(sain)) == NULL)
+            return ERROR_INT("sa not made from sain", __func__, 1);
+    } else {
+        sa = getSortedPathnamesInDirectory(dir, "pdf", 0, 0);
+        if (!sa)
+            return ERROR_INT("no files found in dir", __func__, 1);
+    }
+    if ((nfiles = sarrayGetCount(sa)) == 0) {
+            /* Stop before the output directory is wiped; there is
+             * no first file from which to estimate the resolution. */
+        sarrayDestroy(&sa);
+        return ERROR_INT("no input pdf files", __func__, 1);
+    }
+
+        /* Set up directory for rendered page images.  Any images left
+         * from a previous invocation are removed. */
+    lept_rmdir("lept/renderpdf");
+    if (lept_mkdir("lept/renderpdf")) {
+        sarrayDestroy(&sa);
+        return ERROR_INT("output directory not made", __func__, 1);
+    }
+    if ((imagedir = genPathname("/tmp/lept/renderpdf", NULL)) == NULL) {
+        sarrayDestroy(&sa);
+        return ERROR_INT("imagedir not made", __func__, 1);
+    }
+
+        /* Figure out the resolution to use with the image renderer.
+           This first checks the media box sizes, which give the output
+           image size in printer points (1/72 inch).  The largest expected
+           output image has a max dimension of about 11 inches, corresponding
+           to 792 points.  At a resolution of 300 ppi, the max image size
+           is then 3300.  For robustness, use the median of media box sizes.
+           If the max dimension of this median is significantly larger than
+           792, reduce the input resolution to the renderer. Specifically:
+           * Calculate the median of the MediaBox widths and heights.
+           * If the max exceeds 850, reduce the resolution so that the max
+             dimension of the rendered image is 3300.  The new resolution
+             input to the renderer is reduced from 300 by the factor:
+                 (792 / medmax)
+           If the media boxes are not found, render a page using a small
+           given resolution (72) and use the max dimension to find the
+           resolution, render_res, that will produce an output image with
+           3300 pixels in the largest dimension.
+           render_res is preset to 300 so that it has a defined, usable
+           value even if the helper returns without setting it. */
+    firstfile = sarrayGetString(sa, 0, L_NOCOPY);
+    render_res = 300;
+    getPdfRendererResolution(firstfile, imagedir, &render_res);
+    if (render_res <= 0) render_res = 300;
+
+        /* The input %res gives the actual resolution at which the page is
+           to be rendered.  If this is less than 300 ppi, reduce render_res,
+           the resolution input to pdftoppm, by the factor:
+               (res / 300)
+           Integer division can round a small value down to 0, which is
+           not a usable resolution, so keep it at least 1. */
+    render_res = (render_res * res) / 300;
+    if (render_res < 1) render_res = 1;
+
+        /* Rasterize: '-r res' renders output at res ppi
+         *     pdftoppm -r res fname outroot
+         * The two paths are double-quoted so that names with embedded
+         * spaces survive the shell.  NOTE(review): the command is still
+         * run through system(), so names containing '"', '$' or '`'
+         * are not safe; do not use with untrusted file names. */
+    for (i = 0; i < nfiles; i++) {
+        if ((fname = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
+            continue;
+        tail = basename = NULL;
+        splitPathAtDirectory(fname, NULL, &tail);
+        splitPathAtExtension(tail, &basename, NULL);
+        nchars = -1;
+        if (basename)
+            nchars = snprintf(buf, sizeof(buf),
+                              "pdftoppm -r %d \"%s\" \"%s/%s\"",
+                              render_res, fname, imagedir, basename);
+        lept_free(tail);
+        lept_free(basename);
+
+            /* Never run a truncated command: it could name the wrong
+             * input or write to the wrong output root. */
+        if (nchars < 0 || (size_t)nchars >= sizeof(buf)) {
+            L_ERROR("command not made for %s; skipping\n", __func__, fname);
+            continue;
+        }
+        lept_stderr("%s\n", buf);
+        callSystemDebug(buf);  /* pdftoppm */
+    }
+    sarrayDestroy(&sa);
+
+        /* Generate the output array of image file names */
+    *psaout = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0);
+    lept_free(imagedir);
+    if (*psaout == NULL)
+        return ERROR_INT("rendered image names not found", __func__, 1);
+    return 0;
+}
+
+
+/* --------------------------------------------*/
+#endif  /* USE_PDFIO */
+/* --------------------------------------------*/
+
+
+
+/* ------------------------------------------------------------------------- *
+ *                    Stubs if pdf is not supported                          *
+ * ------------------------------------------------------------------------- */
+
+/* -----------------------------------------------------------------*/
+#if !USE_PDFIO
+/* -----------------------------------------------------------------*/
+
+l_ok l_pdfRenderFile(const char *filename, l_int32 res, SARRAY **psaout)
+{
+    return ERROR_INT("function not present", __func__, 1);
+}
+
+/* 
-----------------------------------------------------------*/ + +l_ok l_pdfRenderFiles(const char *dir, SARRAY *sain, l_int32 res, + SARRAY **psaout) +{ + return ERROR_INT("function not present", __func__, 1); +} + +/* -----------------------------------------------------------------*/ +#endif /* !USE_PDFIO */ +/* -----------------------------------------------------------------*/ + diff --git a/version-notes.html b/version-notes.html index edd985f1d..3701777e3 100644 --- a/version-notes.html +++ b/version-notes.html @@ -104,12 +104,16 @@

* Added misctest2.c to show crop and clean page functions. * Added page crop option for removing noise on left and right sides. * Added page crop option to allow printing to full width of paper. + * Added renderpdf.c to library, to render pdfs as page images with + a specified resolution. + * Modified cleanpdf.c to use l_pdfRenderFiles(). * Source files changed: adaptmap.c affinecompose.c, colormap.c, compare.c, gplot.c, grayquant.c, jbclass.c, jp2kheader.c, jp2kheaderstub.c, jp2kio.c, libversions.c, morphseq.c, pageseg.c partify.c, pdfapp.c, pdfio2.c, pixconv.c, - psio2.c, readfile.c, sel1.c, utils2.c, writefile.c, + psio2.c, readfile.c, renderpdf.c, + sel1.c, utils2.c, writefile.c, allheaders.h, environ.h * Prog files changed: alltests_reg.c, binmorph3_reg.c, blend2_reg.c, cleanpdf.c, compressedpdf.c, croppdf.c,