From 2b1f02aea44f7aed103defc3f67bfb700d76988c Mon Sep 17 00:00:00 2001 From: danblooomberg Date: Sat, 20 Jul 2024 19:49:34 -0600 Subject: [PATCH] Modify prog/croppdf.c and prog/compresspdf.c to use pdf rendering function * This cleans up and simplifies these functions. --- prog/cleanpdf.c | 2 +- prog/compresspdf.c | 108 ++++++++++--------------------------- prog/croppdf.c | 130 ++++++++++++++++----------------------------- src/pageseg.c | 9 ++-- src/pdfapp.c | 32 ++++++----- 5 files changed, 100 insertions(+), 181 deletions(-) diff --git a/prog/cleanpdf.c b/prog/cleanpdf.c index 5670552d6..8c77ec695 100644 --- a/prog/cleanpdf.c +++ b/prog/cleanpdf.c @@ -119,7 +119,7 @@ * * To get an output filename with spaces, use single quotes; e.g., * cleanpdf dir [...] title 'quoted filename with spaces' * - * N.B. This requires running pdfimages from the Poppler package + * N.B. This requires running pdftoppm from the Poppler package * of pdf utilities. For non-unix systems, this requires * installation of the cygwin Poppler package: * https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/ diff --git a/prog/compresspdf.c b/prog/compresspdf.c index de28541a9..84639290a 100644 --- a/prog/compresspdf.c +++ b/prog/compresspdf.c @@ -42,12 +42,6 @@ * be both set to 1. Then the pages with color are compressed with DCT * and the monochrome pages are compressed with tiffg4. * - * The first step is to render the images as RGB, using Poppler's pdftoppm. - * Compare compresspdf with cleanpdf, which carries out several cleanup - * operations, such as deskewing and adaptive thresholding to clean - * noisy or dark backgrounds in grayscale or color images, resulting - * in high resolution, 1 bpp tiffg4 encoded images in the pdf. - * * Syntax: * compresspdf basedir imres scalefactor onebit savecolor * quality title fileout @@ -65,11 +59,11 @@ * * The %scalefactor is typically used to downscale the image to * reduce the size of the generated pdf. 
It should not affect the - * pdf display otherwise. For normal text on images scanned at 300 ppi, - * a 2x reduction (%scalefactor = 0.5) may be satisfactory. - * We compute an output resolution for that pdf that will cause it - * to print 11 inches high, based on the height in pixels of the - * first image in the set. + * pdf display otherwise. The maximum allowed value is 2.0. + * For normal text on images scanned at 300 ppi, a 2x reduction + * (%scalefactor = 0.5) may be satisfactory. We compute an output + * resolution for that pdf that will cause it to print 11 inches high, + * based on the height in pixels of the first image in the set. * * Images are saved in the ./image directory as RGB in ppm format. * If the %onebit flag is 0, these will be encoded in the output pdf @@ -92,7 +86,11 @@ * The pdf output is written to %fileout. It is advisable (but not * required) to have a '.pdf' extension. * - * The intent is to use pdftoppm to render the images at 150 pixels/inch + * As the first step in processing, images are saved in the directory + * /tmp/lept/renderpdf/, as RGB at 300 ppi in ppm format. Each image + * is about 26MB. + * + * We use pdftoppm to render the images at (typically) 150 pixels/inch * for a full page, when scalefactor = 1.0. The renderer uses the * mediaboxes to decide how big to make the images. If those boxes * have values that are too large, the intermediate ppm images can @@ -101,8 +99,8 @@ * at about 150 ppi (when scalefactor = 1.0). These images are about * 6MB, but are written quickly because there is no compression. * - * N.B. This requires the Poppler package of pdf utilities, such as - * pdfimages and pdftoppm. For non-unix systems, this requires + * N.B. 
This requires running pdftoppm from the Poppler package + * of pdf utilities. For non-unix systems, this requires * installation of the cygwin Poppler package: * https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/ * poppler-0.26.5-1 @@ -112,27 +110,16 @@ #include #endif /* HAVE_CONFIG_H */ -#ifdef _WIN32 -# if defined(_MSC_VER) || defined(__MINGW32__) -# include -# else -# include -# endif /* _MSC_VER || __MINGW32__ */ -#endif /* _WIN32 */ - -#include "string.h" -#include -#include #include "allheaders.h" l_int32 main(int argc, char **argv) { char buf[256]; -char *basedir, *fname, *tail, *basename, *imagedir, *title, *fileout; -l_int32 imres, render_res, onebit, savecolor, quality, i, n, ret; +char *basedir, *title, *fileout; +l_int32 imres, render_res, onebit, savecolor, quality; l_float32 scalefactor; -SARRAY *sa; +SARRAY *safiles; if (argc != 9) return ERROR_INT( @@ -146,13 +133,18 @@ SARRAY *sa; quality = atoi(argv[6]); /* jpeg quality */ title = argv[7]; fileout = argv[8]; - setLeptDebugOK(1); - if (imres == 0) imres = 150; /* default value */ + if (imres <= 0) imres = 150; /* default value */ if (imres != 150 && imres != 300) { L_WARNING("imres = %d must be 150 or 300; setting to 150\n", __func__, imres); imres = 150; } + if (scalefactor <= 0.0) scalefactor = 1.0; + if (scalefactor > 2.0) { + L_WARNING("scalefactor %f too big; setting to 2.0\n", __func__, + scalefactor); + scalefactor = 2.0; + } if (quality <= 0) quality = 50; /* default value */ if (quality < 25) { L_WARNING("quality = %d is too low; setting to 25\n", @@ -164,53 +156,12 @@ SARRAY *sa; __func__, quality); quality = 95; } + setLeptDebugOK(1); - /* Set up a directory for temp images */ - if ((imagedir = stringJoin(basedir, "/image")) == NULL) - return ERROR_INT_1("imagedir from basedir not found", basedir, + /* Render all images from pdfs */ + if (l_pdfRenderFiles(basedir, NULL, imres, &safiles)) + return ERROR_INT_1("rendering failed from basedir", basedir, __func__, 1); -#ifndef 
_WIN32 - mkdir(imagedir, 0777); -#else - _mkdir(imagedir); -#endif /* _WIN32 */ - - /* Get the names of the pdf files */ - if ((sa = getSortedPathnamesInDirectory(basedir, "pdf", 0, 0)) == NULL) - return ERROR_INT("files not found", __func__, 1); - sarrayWriteStderr(sa); - n = sarrayGetCount(sa); - - /* Use the first pdf file in the directory to estimate the - * resolution to use with the image renderer that will generate - * page images with a resolution of either about 150 ppi - * (which is the default) or about 300 ppi for special cases. - * At 150 and 300 ppi, the page images have maximum dimensions - * of about 1650 and 3300 pixels, respectively. These are the - * uncompressed images, written to file, from which the compressed - * images will be generated. */ - fname = sarrayGetString(sa, 0, L_NOCOPY); - getPdfRendererResolution(fname, imagedir, &render_res); /* for 300 ppi */ - if (imres == 150) render_res /= 2; - - /* Rasterize: - * pdftoppm -r 150 fname outroot [max dimension about 1650 pixels] - * pdftoppm -r 300 fname outroot [max dimension about 3300 pixels] - * Use of pdftoppm: - * This works on all pdf pages, both wrapped images and pages that - * were made orthographically. */ - for (i = 0; i < n; i++) { - fname = sarrayGetString(sa, i, L_NOCOPY); - splitPathAtDirectory(fname, NULL, &tail); - splitPathAtExtension(tail, &basename, NULL); - snprintf(buf, sizeof(buf), "pdftoppm -r %d %s %s/%s", - render_res, fname, imagedir, basename); - lept_free(tail); - lept_free(basename); - lept_stderr("%s\n", buf); - callSystemDebug(buf); - } - sarrayDestroy(&sa); /* Optionally binarize, then scale and collect all images in memory. * If n > 100, use pixacomp instead of pixa to store everything @@ -219,13 +170,10 @@ SARRAY *sa; * the images in leptonica. Do not let 'pdftoppm -mono' do * the binarization, because it will apply error-diffusion * dithering to gray and color images. 
*/ - sa = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0); - lept_free(imagedir); - sarrayWriteStderr(sa); lept_stderr("compressing ...\n"); - compressFilesToPdf(sa, onebit, savecolor, scalefactor, quality, + compressFilesToPdf(safiles, onebit, savecolor, scalefactor, quality, title, fileout); - sarrayDestroy(&sa); + sarrayDestroy(&safiles); return 0; } diff --git a/prog/croppdf.c b/prog/croppdf.c index dcc2f457b..158a8e2cc 100644 --- a/prog/croppdf.c +++ b/prog/croppdf.c @@ -34,29 +34,34 @@ * scales the width to fill a printed page. See documentation for * pixCropImage() for the parameters. * - * The pdfs are concatenated in lexical order, and each image - * is encoded with tiffg4. + * The pdfs are concatenated in lexical order. Each image is 1 bpp + * and is encoded with tiffg4. * * Syntax: - * croppdf basedir lrclear tbclear edgeclean lradd tbadd maxwiden - * printwiden title fileout + * croppdf basedir lrclear tbclear edgeclean lrborder tbborder + * maxwiden printwiden title fileout * - * The %basedir is a directory where the input pdf files are located. + * Typical parameters for an invocation are: + * croppdf . 50 50 5 70 70 1.12 1 none out.pdf + * + * Parameter %basedir is a directory where the input pdf files are located. * The program will operate on every file in this directory with - * the ".pdf" extension. + * the ".pdf" extension, taking them in lexical order. * - * The %lrclear and %tbclear parameters give the number of background - * pixels to be added to the foreground region. + * The %lrclear and %tbclear parameters give the width of the + * regions at the left-right and top-bottom edges of the input image + * that are cleared to background as the first step in the processing. 
* * The %edgeclean parameter is used to remove edge noise: + * The %edgeclean parameter is used to remove noise that is typically + * near the edges of the image: * -1: aggressively removes left and right side noise * 0: default, no removal * 1-15: removal of random noise, where 15 is maximally aggressive * - * The suggested value for %lradd and %tbadd is 50. Laser printers do not - * print foreground pixels very close to the page edges, and using a - * margin of 50 pixels (1/6" at 300 ppi) should allow all foregrounnd - * pixels to be printed. + * The suggested value for %lrborder and %tbborder is 70. + * Laser printers do not print foreground pixels very close to the + * page edges, and using a margin of 70 pixels (about 1/4" at 300 ppi) + * will allow all foreground pixels to be printed. * * The %maxwiden parameter allows the foreground to better fill an * 8.5 x 11 inch printed page. It gives the maximum fractional horizontal @@ -73,12 +78,21 @@ * The pdf output is written to %fileout. It is advisable (but not * required) to have a '.pdf' extension. * - * As the first step in processing, images are saved in the ./image - * directory as RGB at 300 ppi in ppm format. Each image is about 26MB. - * Delete those images after use. + * The first processing step is to render images from the pdf as RGB + * at 300 ppi in ppm format, and to save them in the directory + * /tmp/lept/renderpdf/. * - * N.B. This requires the Poppler package of pdf utilities, such as - * pdfimages and pdftoppm. For non-unix systems, this requires + * We use pdftoppm to render the images at 300 pixels/inch for a + * full page. The renderer uses the mediaboxes to decide how big + * to make the images. If those boxes have values that are too large, + * the intermediate ppm images can be very large. To prevent that, + * we compute the resolution to input to pdftoppm that results + * in RGB ppm images representing page images at about 300 ppi. 
+ * These images are about 25MB, and are written quickly because + * there is no compression. + + * N.B. This requires running pdftoppm from the Poppler package + * of pdf utilities. For non-unix systems, this requires * installation of the cygwin Poppler package: * https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/ * poppler-0.26.5-1 @@ -88,99 +102,47 @@ #include #endif /* HAVE_CONFIG_H */ -#ifdef _WIN32 -# if defined(_MSC_VER) || defined(__MINGW32__) -# include -# else -# include -# endif /* _MSC_VER || __MINGW32__ */ -#endif /* _WIN32 */ - -#include "string.h" -#include -#include #include "allheaders.h" l_int32 main(int argc, char **argv) { char buf[256]; -char *basedir, *fname, *tail, *basename, *imagedir, *title, *fileout; -l_int32 lrclear, tbclear, edgeclean, lradd, tbadd, printwiden; -l_int32 render_res, i, n, ret; +char *basedir, *title, *fileout; +l_int32 lrclear, tbclear, edgeclean, lrborder, tbborder; +l_int32 printwiden, render_res; l_float32 maxwiden; -SARRAY *sa; +SARRAY *safiles; if (argc != 11) return ERROR_INT( "Syntax: croppdf basedir lrclear tbclear edgeclean " - "lradd tbadd maxwiden printwiden title fileout", __func__, 1); + "lrborder tbborder maxwiden printwiden title fileout", __func__, 1); basedir = argv[1]; lrclear = atoi(argv[2]); tbclear = atoi(argv[3]); edgeclean = atoi(argv[4]); - lradd = atoi(argv[5]); - tbadd = atoi(argv[6]); + lrborder = atoi(argv[5]); + tbborder = atoi(argv[6]); maxwiden = atof(argv[7]); printwiden = atoi(argv[8]); title = argv[9]; fileout = argv[10]; + setLeptDebugOK(1); - /* Set up a directory for temp images */ - if ((imagedir = stringJoin(basedir, "/image")) == NULL) - return ERROR_INT_1("imagedir from basedir not found", basedir, + /* Render all images from pdfs */ + if (l_pdfRenderFiles(basedir, NULL, 300, &safiles)) + return ERROR_INT_1("rendering failed from basedir", basedir, __func__, 1); -#ifndef _WIN32 - mkdir(imagedir, 0777); -#else - _mkdir(imagedir); -#endif /* _WIN32 */ - - /* Get the 
names of the pdf files */ - if ((sa = getSortedPathnamesInDirectory(basedir, "pdf", 0, 0)) == NULL) - return ERROR_INT("files not found", __func__, 1); - sarrayWriteStderr(sa); - n = sarrayGetCount(sa); - - /* Figure out the resolution to use with the image renderer to - * generate page images with a resolution of not more than 150 ppi. - * These would have a maximum dimension of about 1650 pixels. - * Use the first pdf file in the directory. */ - fname = sarrayGetString(sa, 0, L_NOCOPY); - getPdfRendererResolution(fname, imagedir, &render_res); /* for 300 ppi */ - - /* Rasterize: - * pdftoppm -r 300 fname outroot - * Use of pdftoppm: - * This works on all pdf pages, both wrapped images and pages that - * were made orthographically. We generate images that are no - * larger than about 1650 pixels in the maximum direction. This - * makes uncompressed 6 MB files and is very fast. If you want - * higher resolution 1 bpp output, use cleanpdf.c. */ - for (i = 0; i < n; i++) { - fname = sarrayGetString(sa, i, L_NOCOPY); - splitPathAtDirectory(fname, NULL, &tail); - splitPathAtExtension(tail, &basename, NULL); - snprintf(buf, sizeof(buf), "pdftoppm -r %d %s %s/%s", - render_res, fname, imagedir, basename); - lept_free(tail); - lept_free(basename); - lept_stderr("%s\n", buf); - callSystemDebug(buf); - } - sarrayDestroy(&sa); /* Process each image and collect all resulting 1 bpp images * in memory. If n > 200, use pixacomp instead of pixa to * store the images before generating the pdf. 
*/ - sa = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0); - lept_free(imagedir); - sarrayWriteStderr(sa); lept_stderr("cropping ...\n"); - cropFilesToPdf(sa, lrclear, tbclear, edgeclean, lradd, tbadd, maxwiden, - printwiden, title, fileout); - sarrayDestroy(&sa); + cropFilesToPdf(safiles, lrclear, tbclear, edgeclean, lrborder, tbborder, + maxwiden, printwiden, title, fileout); + sarrayDestroy(&safiles); return 0; } diff --git a/src/pageseg.c b/src/pageseg.c index 3209a5a7a..1da982ee4 100644 --- a/src/pageseg.c +++ b/src/pageseg.c @@ -554,7 +554,8 @@ PIX *pix1, *pix2, *pix3, *pixd; * Notes: * (1) This binarizes and crops a page image. * (a) Binarizes if necessary and does 2x reduction. - * (b) Clears near the border by %lr_clear/2 and %tb_clear/2 pixels + * (b) Clears near the border by %lr_clear and %tb_clear full + * resolution pixels. (This is done at 2x reduction.) * (c) If %edgeclean > 0, it removes isolated sets of pixels, * using a close/open operation of size %edgeclean + 1. * If %edgeclean < 0, it uses a large vertical morphological @@ -645,7 +646,7 @@ PIXA *pixa1; pix1 = pixBackgroundNormTo1MinMax(pixs, 1, 1); pix2 = pixReduceRankBinary2(pix1, 2, NULL); - /* Clear out border pixels */ + /* Clear out pixels near the image edges */ pixSetOrClearBorder(pix2, lr_clear / 2, lr_clear / 2, tb_clear / 2, tb_clear / 2, PIX_CLR); if (pixa1) pixaAddPix(pixa1, pixScale(pix2, 2.0, 2.0), L_INSERT); @@ -670,6 +671,7 @@ PIXA *pixa1; pixDestroy(&pix2); if (ret) { L_ERROR("no returned b.b. 
for foreground\n", __func__); + boxDestroy(&box1); pixDestroy(&pix1); pixaDestroy(&pixa1); return NULL; @@ -785,9 +787,8 @@ PIX *pix1; boxa1 = pixConnCompBB(pix1, 8); pixDestroy(&pix1); boxa2 = boxaSort(boxa1, L_SORT_BY_AREA, L_SORT_DECREASING, NULL); - box1 = boxaGetBox(boxa2, 0, L_COPY); if ((n = boxaGetCount(boxa2)) == 1) { - *pbox = box1; + *pbox = boxaGetBox(boxa2, 0, L_COPY); } else { /* 2 or more */ box1 = boxaGetBox(boxa2, 0, L_COPY); box2 = boxaGetBox(boxa2, 1, L_COPY); diff --git a/src/pdfapp.c b/src/pdfapp.c index 9a2e25b03..9a9ef6471 100644 --- a/src/pdfapp.c +++ b/src/pdfapp.c @@ -116,9 +116,10 @@ * causes DCT compression of color images and tiffg4 compression * of monochrome images. * (6) The images will be concatenated in the order given in %sa. - * (7) The scalefactor is applied to each image before encoding. - * If you enter a value <= 0.0, it will be set to 1.0. - * (8) Default jpeg quality is 50; otherwise, quality factors between + * (7) Typically, %scalefactor <= 1.0. It is applied to each image + * before encoding. If you enter a value <= 0.0, it will be set to 1.0. + * The maximum allowed value is 2.0. + * (8) Default jpeg %quality is 50; otherwise, quality factors between * 25 and 95 are enforced. * (9) Page images at 300 ppi are about 8 Mpixels. RGB(A) rasters are * then about 32 MB (1 bpp images are about 1 MB). 
If there are @@ -148,6 +149,11 @@ PIXAC *pixac1 = NULL; if (!fileout) return ERROR_INT("fileout not defined", __func__, 1); if (scalefactor <= 0) scalefactor = 1.0; + if (scalefactor > 2.0) { + L_WARNING("scalefactor %f too big; setting to 2.0\n", __func__, + scalefactor); + scalefactor = 2.0; + } if (quality <= 0) quality = 50; /* default value */ if (quality < 25) { L_WARNING("quality %d too low; setting to 25\n", __func__, quality); @@ -236,8 +242,8 @@ PIXAC *pixac1 = NULL; * default = 0 (no removal); * 15 is maximally aggressive for random noise * -1 for aggressively removing side noise - * \param[in] lr_add full res expansion of crop box on left and right - * \param[in] tb_add full res expansion of crop box on top and bottom + * \param[in] lr_border full res final "added" pixels on left and right + * \param[in] tb_border full res final "added" pixels on top and bottom * \param[in] maxwiden max fractional horizontal stretch allowed * \param[in] printwiden 0 to skip, 1 for 8.5x11, 2 for A4 * \param[in] title [optional] pdf title; can be null @@ -252,10 +258,11 @@ PIXAC *pixac1 = NULL; * (2) It does the image processing for prog/croppdf.c. * (3) Images in the output pdf are 1 bpp and encoded with tiffg4. * (4) See documentation in pixCropImage() for details on the processing. - * (5) The images will be concatenated in the order given in %sa. - * (6) Page images at 300 ppi are about 1 Mpixels. We allow up to 200 - * uncompressed rasters to be stored in memory. If more than 200 - * pages, the stored images are compressed with tiffg4. + * (5) The images will be concatenated in the order given in %sa. + * (6) Output page images are at 300 ppi and are stored in memory. + * They are about 1 MB when uncompressed. For up to 200 pages, + * the images are stored uncompressed; otherwise, the stored + * images are compressed with tiffg4. 
* */ l_ok @@ -263,8 +270,8 @@ cropFilesToPdf(SARRAY *sa, l_int32 lr_clear, l_int32 tb_clear, l_int32 edgeclean, - l_int32 lr_add, - l_int32 tb_add, + l_int32 lr_border, + l_int32 tb_border, l_float32 maxwiden, l_int32 printwiden, const char *title, @@ -296,7 +303,8 @@ PIXAC *pixac1 = NULL; fname = sarrayGetString(sa, i, L_NOCOPY); pixs = pixRead(fname); pix1 = pixCropImage(pixs, lr_clear, tb_clear, edgeclean, - lr_add, tb_add, maxwiden, printwiden, NULL, NULL); + lr_border, tb_border, maxwiden, printwiden, + NULL, NULL); if (n <= maxsmallset) pixaAddPix(pixa1, pix1, L_INSERT); else