Skip to content

Commit

Permalink
Modify prog/croppdf.c and prog/compresspdf.c to use pdf rendering fun…
Browse files Browse the repository at this point in the history
…ction

* This cleans up and simplifies these functions.
  • Loading branch information
DanBloomberg committed Jul 21, 2024
1 parent 0cadcdf commit 2b1f02a
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 181 deletions.
2 changes: 1 addition & 1 deletion prog/cleanpdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@
* * To get an output filename with spaces, use single quotes; e.g.,
* cleanpdf dir [...] title 'quoted filename with spaces'
*
* N.B. This requires running pdfimages from the Poppler package
* N.B. This requires running pdftoppm from the Poppler package
* of pdf utilities. For non-unix systems, this requires
* installation of the cygwin Poppler package:
* https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/
Expand Down
108 changes: 28 additions & 80 deletions prog/compresspdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,6 @@
* be both set to 1. Then the pages with color are compressed with DCT
* and the monochrome pages are compressed with tiffg4.
*
* The first step is to render the images as RGB, using Poppler's pdftoppm.
* Compare compresspdf with cleanpdf, which carries out several cleanup
* operations, such as deskewing and adaptive thresholding to clean
* noisy or dark backgrounds in grayscale or color images, resulting
* in high resolution, 1 bpp tiffg4 encoded images in the pdf.
*
* Syntax:
* compresspdf basedir imres scalefactor onebit savecolor
* quality title fileout
Expand All @@ -65,11 +59,11 @@
*
* The %scalefactor is typically used to downscale the image to
* reduce the size of the generated pdf. It should not affect the
* pdf display otherwise. For normal text on images scanned at 300 ppi,
* a 2x reduction (%scalefactor = 0.5) may be satisfactory.
* We compute an output resolution for that pdf that will cause it
* to print 11 inches high, based on the height in pixels of the
* first image in the set.
* pdf display otherwise. The maximum allowed value is 2.0.
* For normal text on images scanned at 300 ppi, a 2x reduction
* (%scalefactor = 0.5) may be satisfactory. We compute an output
* resolution for that pdf that will cause it to print 11 inches high,
* based on the height in pixels of the first image in the set.
*
* Images are saved in the ./image directory as RGB in ppm format.
* If the %onebit flag is 0, these will be encoded in the output pdf
Expand All @@ -92,7 +86,11 @@
* The pdf output is written to %fileout. It is advisable (but not
* required) to have a '.pdf' extension.
*
* The intent is to use pdftoppm to render the images at 150 pixels/inch
* As the first step in processing, images are saved in the directory
* /tmp/lept/renderpdf/, as RGB at 300 ppi in ppm format. Each image
* is about 26MB.
*
* We use pdftoppm to render the images at (typically) 150 pixels/inch
* for a full page, when scalefactor = 1.0. The renderer uses the
* mediaboxes to decide how big to make the images. If those boxes
* have values that are too large, the intermediate ppm images can
Expand All @@ -101,8 +99,8 @@
* at about 150 ppi (when scalefactor = 1.0). These images are about
* 6MB, but are written quickly because there is no compression.
*
* N.B. This requires the Poppler package of pdf utilities, such as
* pdfimages and pdftoppm. For non-unix systems, this requires
* N.B. This requires running pdftoppm from the Poppler package
* of pdf utilities. For non-unix systems, this requires
* installation of the cygwin Poppler package:
* https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/
* poppler-0.26.5-1
Expand All @@ -112,27 +110,16 @@
#include <config_auto.h>
#endif /* HAVE_CONFIG_H */

#ifdef _WIN32
# if defined(_MSC_VER) || defined(__MINGW32__)
# include <direct.h>
# else
# include <io.h>
# endif /* _MSC_VER || __MINGW32__ */
#endif /* _WIN32 */

#include "string.h"
#include <sys/stat.h>
#include <sys/types.h>
#include "allheaders.h"

l_int32 main(int argc,
char **argv)
{
char buf[256];
char *basedir, *fname, *tail, *basename, *imagedir, *title, *fileout;
l_int32 imres, render_res, onebit, savecolor, quality, i, n, ret;
char *basedir, *title, *fileout;
l_int32 imres, render_res, onebit, savecolor, quality;
l_float32 scalefactor;
SARRAY *sa;
SARRAY *safiles;

if (argc != 9)
return ERROR_INT(
Expand All @@ -146,13 +133,18 @@ SARRAY *sa;
quality = atoi(argv[6]); /* jpeg quality */
title = argv[7];
fileout = argv[8];
setLeptDebugOK(1);
if (imres == 0) imres = 150; /* default value */
if (imres <= 0) imres = 150; /* default value */
if (imres != 150 && imres != 300) {
L_WARNING("imres = %d must be 150 or 300; setting to 150\n",
__func__, imres);
imres = 150;
}
if (scalefactor <= 0.0) scalefactor = 1.0;
if (scalefactor > 2.0) {
L_WARNING("scalefactor %f too big; setting to 2.0\n", __func__,
scalefactor);
scalefactor = 2.0;
}
if (quality <= 0) quality = 50; /* default value */
if (quality < 25) {
L_WARNING("quality = %d is too low; setting to 25\n",
Expand All @@ -164,53 +156,12 @@ SARRAY *sa;
__func__, quality);
quality = 95;
}
setLeptDebugOK(1);

/* Set up a directory for temp images */
if ((imagedir = stringJoin(basedir, "/image")) == NULL)
return ERROR_INT_1("imagedir from basedir not found", basedir,
/* Render all images from pdfs */
if (l_pdfRenderFiles(basedir, NULL, imres, &safiles))
return ERROR_INT_1("rendering failed from basedir", basedir,
__func__, 1);
#ifndef _WIN32
mkdir(imagedir, 0777);
#else
_mkdir(imagedir);
#endif /* _WIN32 */

/* Get the names of the pdf files */
if ((sa = getSortedPathnamesInDirectory(basedir, "pdf", 0, 0)) == NULL)
return ERROR_INT("files not found", __func__, 1);
sarrayWriteStderr(sa);
n = sarrayGetCount(sa);

/* Use the first pdf file in the directory to estimate the
* resolution to use with the image renderer that will generate
* page images with a resolution of either about 150 ppi
* (which is the default) or about 300 ppi for special cases.
* At 150 and 300 ppi, the page images have maximum dimensions
* of about 1650 and 3300 pixels, respectively. These are the
* uncompressed images, written to file, from which the compressed
* images will be generated. */
fname = sarrayGetString(sa, 0, L_NOCOPY);
getPdfRendererResolution(fname, imagedir, &render_res); /* for 300 ppi */
if (imres == 150) render_res /= 2;

/* Rasterize:
* pdftoppm -r 150 fname outroot [max dimension about 1650 pixels]
* pdftoppm -r 300 fname outroot [max dimension about 3300 pixels]
* Use of pdftoppm:
* This works on all pdf pages, both wrapped images and pages that
* were made orthographically. */
for (i = 0; i < n; i++) {
fname = sarrayGetString(sa, i, L_NOCOPY);
splitPathAtDirectory(fname, NULL, &tail);
splitPathAtExtension(tail, &basename, NULL);
snprintf(buf, sizeof(buf), "pdftoppm -r %d %s %s/%s",
render_res, fname, imagedir, basename);
lept_free(tail);
lept_free(basename);
lept_stderr("%s\n", buf);
callSystemDebug(buf);
}
sarrayDestroy(&sa);

/* Optionally binarize, then scale and collect all images in memory.
* If n > 100, use pixacomp instead of pixa to store everything
Expand All @@ -219,13 +170,10 @@ SARRAY *sa;
* the images in leptonica. Do not let 'pdftoppm -mono' do
* the binarization, because it will apply error-diffusion
* dithering to gray and color images. */
sa = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0);
lept_free(imagedir);
sarrayWriteStderr(sa);
lept_stderr("compressing ...\n");
compressFilesToPdf(sa, onebit, savecolor, scalefactor, quality,
compressFilesToPdf(safiles, onebit, savecolor, scalefactor, quality,
title, fileout);
sarrayDestroy(&sa);
sarrayDestroy(&safiles);
return 0;
}

130 changes: 46 additions & 84 deletions prog/croppdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,29 +34,34 @@
* scales the width to fill a printed page. See documentation for
* pixCropImage() for the parameters.
*
* The pdfs are concatenated in lexical order, and each image
* is encoded with tiffg4.
* The pdfs are concatenated in lexical order. Each image is 1 bpp
* and is encoded with tiffg4.
*
* Syntax:
* croppdf basedir lrclear tbclear edgeclean lradd tbadd maxwiden
* printwiden title fileout
* croppdf basedir lrclear tbclear edgeclean lrborder tbborder
* maxwiden printwiden title fileout
*
* The %basedir is a directory where the input pdf files are located.
* Typical parameters for an invocation are:
* croppdf . 50 50 5 70 70 1.12 1.1 none <output-file-name>
*
* Parameter %basedir is a directory where the input pdf files are located.
* The program will operate on every file in this directory with
* the ".pdf" extension.
* the ".pdf" extension, taking them in lexical order.
*
* The %lrclear and %tbclear parameters give the number of background
* pixels to be added to the foreground region.
* Parameters %lrclear and %tbclear give the width of the
* regions at the left-right and top-bottom edges of the input image
* that are cleared to background as the first step in the processing.
*
* The %edgeclean parameter is used to remove edge noise:
* The %edgeclean parameter is used to remove noise that is typically
* near the edges of the image:
* -1: aggressively removes left and right side noise
* 0: default, no removal
* 1-15: removal of random noise, where 15 is maximally aggressive
*
* The suggested value for %lradd and %tbadd is 50. Laser printers do not
* print foreground pixels very close to the page edges, and using a
* margin of 50 pixels (1/6" at 300 ppi) should allow all foreground
* pixels to be printed.
* The suggested value for %lrborder and %tbborder is 70.
* Laser printers do not print foreground pixels very close to the
* page edges, and using a margin of 70 pixels (about 1/4" at 300 ppi)
* will allow all foreground pixels to be printed.
*
* The %maxwiden parameter allows the foreground to better fill an
* 8.5 x 11 inch printed page. It gives the maximum fractional horizontal
Expand All @@ -73,12 +78,21 @@
* The pdf output is written to %fileout. It is advisable (but not
* required) to have a '.pdf' extension.
*
* As the first step in processing, images are saved in the ./image
* directory as RGB at 300 ppi in ppm format. Each image is about 26MB.
* Delete those images after use.
* The first processing step is to render images from the pdf as RGB
* at 300 ppi in ppm format, and to save them in the directory
* /tmp/lept/renderpdf/.
*
* N.B. This requires the Poppler package of pdf utilities, such as
* pdfimages and pdftoppm. For non-unix systems, this requires
* We use pdftoppm to render the images at 300 pixels/inch for a
* full page. The renderer uses the mediaboxes to decide how big
* to make the images. If those boxes have values that are too large,
* the intermediate ppm images can be very large. To prevent that,
* we compute the resolution to input to pdftoppm that results
* in RGB ppm images representing page images at about 300 ppi.
* These images are about 25MB, and are written quickly because
* there is no compression.
* N.B. This requires running pdftoppm from the Poppler package
* of pdf utilities. For non-unix systems, this requires
* installation of the cygwin Poppler package:
* https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/
* poppler-0.26.5-1
Expand All @@ -88,99 +102,47 @@
#include <config_auto.h>
#endif /* HAVE_CONFIG_H */

#ifdef _WIN32
# if defined(_MSC_VER) || defined(__MINGW32__)
# include <direct.h>
# else
# include <io.h>
# endif /* _MSC_VER || __MINGW32__ */
#endif /* _WIN32 */

#include "string.h"
#include <sys/stat.h>
#include <sys/types.h>
#include "allheaders.h"

l_int32 main(int argc,
char **argv)
{
char buf[256];
char *basedir, *fname, *tail, *basename, *imagedir, *title, *fileout;
l_int32 lrclear, tbclear, edgeclean, lradd, tbadd, printwiden;
l_int32 render_res, i, n, ret;
char *basedir, *title, *fileout;
l_int32 lrclear, tbclear, edgeclean, lrborder, tbborder;
l_int32 printwiden, render_res;
l_float32 maxwiden;
SARRAY *sa;
SARRAY *safiles;

if (argc != 11)
return ERROR_INT(
"Syntax: croppdf basedir lrclear tbclear edgeclean "
"lradd tbadd maxwiden printwiden title fileout", __func__, 1);
"lrborder tbborder maxwiden printwiden title fileout", __func__, 1);
basedir = argv[1];
lrclear = atoi(argv[2]);
tbclear = atoi(argv[3]);
edgeclean = atoi(argv[4]);
lradd = atoi(argv[5]);
tbadd = atoi(argv[6]);
lrborder = atoi(argv[5]);
tbborder = atoi(argv[6]);
maxwiden = atof(argv[7]);
printwiden = atoi(argv[8]);
title = argv[9];
fileout = argv[10];

setLeptDebugOK(1);

/* Set up a directory for temp images */
if ((imagedir = stringJoin(basedir, "/image")) == NULL)
return ERROR_INT_1("imagedir from basedir not found", basedir,
/* Render all images from pdfs */
if (l_pdfRenderFiles(basedir, NULL, 300, &safiles))
return ERROR_INT_1("rendering failed from basedir", basedir,
__func__, 1);
#ifndef _WIN32
mkdir(imagedir, 0777);
#else
_mkdir(imagedir);
#endif /* _WIN32 */

/* Get the names of the pdf files */
if ((sa = getSortedPathnamesInDirectory(basedir, "pdf", 0, 0)) == NULL)
return ERROR_INT("files not found", __func__, 1);
sarrayWriteStderr(sa);
n = sarrayGetCount(sa);

/* Figure out the resolution to use with the image renderer to
* generate page images with a resolution of not more than 150 ppi.
* These would have a maximum dimension of about 1650 pixels.
* Use the first pdf file in the directory. */
fname = sarrayGetString(sa, 0, L_NOCOPY);
getPdfRendererResolution(fname, imagedir, &render_res); /* for 300 ppi */

/* Rasterize:
* pdftoppm -r 300 fname outroot
* Use of pdftoppm:
* This works on all pdf pages, both wrapped images and pages that
* were made orthographically. We generate images that are no
* larger than about 1650 pixels in the maximum direction. This
* makes uncompressed 6 MB files and is very fast. If you want
* higher resolution 1 bpp output, use cleanpdf.c. */
for (i = 0; i < n; i++) {
fname = sarrayGetString(sa, i, L_NOCOPY);
splitPathAtDirectory(fname, NULL, &tail);
splitPathAtExtension(tail, &basename, NULL);
snprintf(buf, sizeof(buf), "pdftoppm -r %d %s %s/%s",
render_res, fname, imagedir, basename);
lept_free(tail);
lept_free(basename);
lept_stderr("%s\n", buf);
callSystemDebug(buf);
}
sarrayDestroy(&sa);

/* Process each image and collect all resulting 1 bpp images
* in memory. If n > 200, use pixacomp instead of pixa to
* store the images before generating the pdf. */
sa = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0);
lept_free(imagedir);
sarrayWriteStderr(sa);
lept_stderr("cropping ...\n");
cropFilesToPdf(sa, lrclear, tbclear, edgeclean, lradd, tbadd, maxwiden,
printwiden, title, fileout);
sarrayDestroy(&sa);
cropFilesToPdf(safiles, lrclear, tbclear, edgeclean, lrborder, tbborder,
maxwiden, printwiden, title, fileout);
sarrayDestroy(&safiles);
return 0;
}

Loading

0 comments on commit 2b1f02a

Please sign in to comment.