Skip to content

Commit

Permalink
New pdfapp.c and pdfappstub.c.
Browse files Browse the repository at this point in the history
* These have functions compressFilesToPdf() and cleanTo1bppFilesToPdf(),
  which hold the image processing functionality that was previously in
  prog/cleanpdf and prog/compresspdf.  These no longer write intermediate
  image files; they store everything in either a pixa (n <= 100 images)
  or a pixac (n > 100 images).
* Input to these functions is an SArray of image file paths.
* The programs pull images out of a set of input pdfs, and put them
  uncompressed in a set of files.  The new functions do some image
  processing and wrap the results up in a single output pdf.
* Also, rename concatpdf --> compresspdf, because all these functions,
  and others to be written, will do concatenation of input pdf files.
* Also, minor documentation fix in gplot.c.
  • Loading branch information
DanBloomberg committed Aug 13, 2023
1 parent 1cfce33 commit 6c15bd5
Show file tree
Hide file tree
Showing 13 changed files with 551 additions and 213 deletions.
2 changes: 1 addition & 1 deletion prog/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ add_prog_target(colorsegtest colorsegtest.c)
add_prog_target(comparepages comparepages.c)
add_prog_target(comparepixa comparepixa.c)
add_prog_target(comparetest comparetest.c)
add_prog_target(concatpdf concatpdf.c)
add_prog_target(compresspdf compresspdf.c)
add_prog_target(contrasttest contrasttest.c)
add_prog_target(convertfilestopdf convertfilestopdf.c)
add_prog_target(convertfilestops convertfilestops.c)
Expand Down
2 changes: 1 addition & 1 deletion prog/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ OTHER_PROGS = adaptmap_dark \
blendcmaptest buffertest \
ccbordtest cctest1 cleanpdf \
colorsegtest comparepages comparepixa \
comparetest concatpdf \
comparetest compresspdf \
contrasttest converttogray \
cornertest corrupttest croptext deskew_it \
dewarprules dewarptest1 dewarptest2 \
Expand Down
126 changes: 13 additions & 113 deletions prog/cleanpdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
*
* Syntax:
* cleanpdf basedir threshold resolution
* darken rotation opensize title outfile
* darken rotation opensize title fileout
*
* A typical command is:
* cleanpdf . 180 300 0 0 0 none <name-of-output-pdf-file>
Expand Down Expand Up @@ -89,7 +89,7 @@
* The %title is the title given to the pdf. Use %title == "none"
* to omit the title.
*
* The pdf output is written to %outfile. It is advisable (but not
* The pdf output is written to %fileout. It is advisable (but not
* required) to have a '.pdf' extension.
*
* Whenever possible, the images will be deskewed.
Expand Down Expand Up @@ -148,24 +148,24 @@
#include <sys/types.h>
#include "allheaders.h"

#if 0
/* Special version */
PIX *pixConvertTo8Special(PIX *pix);
#endif

l_int32 main(int argc,
char **argv)
{
char buf[256], sequence[32];
char buf[256];
char *basedir, *fname, *tail, *basename, *imagedir, *firstfile, *title;
char *outfile, *firstpath;
char *fileout;
l_int32 thresh, res, render_res, rotation, darken, opensize, i, n, ret;
l_int32 medw, medh, medmax, npages, pageno, w, h;
PIX *pixs, *pix1, *pix2, *pix3, *pix4, *pix5, *pix6;
SARRAY *sa, *sa1;
SARRAY *sa;

if (argc != 9)
return ERROR_INT(
"\n Syntax: cleanpdf basedir threshold resolution "
"darken rotation opensize title outfile",
"darken rotation opensize title fileout",
__func__, 1);
basedir = argv[1];
thresh = atoi(argv[2]);
Expand All @@ -174,7 +174,7 @@ SARRAY *sa, *sa1;
rotation = atoi(argv[5]);
opensize = atoi(argv[6]);
title = argv[7];
outfile = argv[8];
fileout = argv[8];
if (thresh > 190) {
L_WARNING("threshold = %d is too large; reducing to 190\n",
__func__, thresh);
Expand Down Expand Up @@ -272,110 +272,10 @@ SARRAY *sa, *sa1;

/* Clean, deskew and compress */
sa = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0);
sarrayWriteStderr(sa);
n = sarrayGetCount(sa);
firstpath = NULL;
for (i = 0; i < n; i++) {
PIX *pix6;
fname = sarrayGetString(sa, i, L_NOCOPY);
pixs = pixRead(fname);
pix1 = pixConvertTo8Special(pixs);
if (rotation > 0)
pix2 = pixRotateOrth(pix1, rotation);
else
pix2 = pixClone(pix1);
pix3 = pixFindSkewAndDeskew(pix2, 2, NULL, NULL);
pix4 = pixBackgroundNormSimple(pix3, NULL, NULL);
if (darken == 0)
pixGammaTRC(pix4, pix4, 2.0, 50, 220);
else if (darken == 1)
pixGammaTRC(pix4, pix4, 1.8, 60, 215);
else if (darken == 2)
pixGammaTRC(pix4, pix4, 1.6, 70, 215);
else if (darken == 3)
pixGammaTRC(pix4, pix4, 1.4, 80, 210);
else if (darken == 4)
pixGammaTRC(pix4, pix4, 1.2, 90, 210);
else if (darken == 5)
pixGammaTRC(pix4, pix4, 1.0, 100, 210);
else if (darken == 6)
pixGammaTRC(pix4, pix4, 0.85, 110, 205);
else if (darken == 7)
pixGammaTRC(pix4, pix4, 0.7, 120, 205);
else if (darken == 8)
pixGammaTRC(pix4, pix4, 0.6, 130, 200);
else /* darken == 9 */
pixGammaTRC(pix4, pix4, 0.5, 140, 195);
if (res == 300)
pix5 = pixThresholdToBinary(pix4, thresh);
else /* res == 600 */
pix5 = pixScaleGray2xLIThresh(pix4, thresh);
if (opensize == 2 || opensize == 3) {
snprintf(sequence, sizeof(sequence), "o%d.%d", opensize, opensize);
pix6 = pixMorphSequence(pix5, sequence, 0);
} else {
pix6 = pixClone(pix5);
}
splitPathAtDirectory(fname, NULL, &tail);
splitPathAtExtension(tail, &basename, NULL);
snprintf(buf, sizeof(buf), "%s/%s.tif", imagedir, basename);
lept_stderr("%s\n", buf);
pixWrite(buf, pix6, IFF_TIFF_G4);
if (i == 0) /* save full path to first image */
firstpath = stringNew(buf);
pixDestroy(&pixs);
pixDestroy(&pix1);
pixDestroy(&pix2);
pixDestroy(&pix3);
pixDestroy(&pix4);
pixDestroy(&pix5);
pixDestroy(&pix6);
lept_free(tail);
lept_free(basename);
}
sarrayDestroy(&sa);

/* Generate the pdf. Compute the actual input resolution from
* the pixel dimensions of the first image. This will cause each
* page to be printed to cover an 8.5 x 11 inch sheet of paper. */
lept_stderr("Write output to %s\n", outfile);
pix1 = pixRead(firstpath);
pixInferResolution(pix1, 11.0, &res);
pixDestroy(&pix1);
lept_free(firstpath);
if (strcmp(title, "none") == 0)
title = NULL;
convertFilesToPdf(imagedir, "tif", res, 1.0, L_G4_ENCODE,
0, title, outfile);
lept_free(imagedir);
sarrayWriteStderr(sa);
lept_stderr("cleaning ...\n");
cleanTo1bppFilesToPdf(sa, thresh, res, darken, rotation, opensize,
title, fileout);
return 0;
}


/*
 * pixConvertTo8Special()
 *
 *   A special version of pixConvertTo8() that returns an image without
 *   a colormap, and uses pixConvertRGBToGrayMinMax() with L_CHOOSE_MIN
 *   to strongly render color into black.
 *
 *   Input:   pixs   (1, 2, 4, 8, 16 or 32 bpp)
 *   Return:  new 8 bpp pix, or NULL if pixs is undefined or has an
 *            unsupported depth.  The caller owns the returned pix.
 */
PIX *
pixConvertTo8Special(PIX  *pixs)
{
    l_int32  d;

    if (!pixs) {
        L_ERROR("pixs not defined\n", __func__);
        return NULL;
    }

    d = pixGetDepth(pixs);
    switch (d) {
    case 1:
        return pixConvert1To8(NULL, pixs, 255, 0);
    case 2:
        return pixConvert2To8(pixs, 0, 85, 170, 255, FALSE);
    case 4:
        return pixConvert4To8(pixs, FALSE);
    case 8:
            /* Strip any colormap so the result is plain grayscale */
        if (pixGetColormap(pixs) != NULL)
            return pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE);
        return pixCopy(NULL, pixs);
    case 16:
        return pixConvert16To8(pixs, L_MS_BYTE);
    case 32:
            /* Taking the min component pushes colored pixels toward black */
        return pixConvertRGBToGrayMinMax(pixs, L_CHOOSE_MIN);
    default:
        break;
    }

        /* Use __func__ so the reported name cannot drift from the
         * actual function name (it was "pixConvertSpecialTo8"). */
    L_ERROR("Invalid depth d = %d\n", __func__, d);
    return NULL;
}
122 changes: 31 additions & 91 deletions prog/concatpdf.c → prog/compresspdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,30 +25,31 @@
*====================================================================*/

/*
* concatpdf.c
* compresspdf.c
*
* This program concatenates all pdfs in a directory by rendering them
* as images, optionally scaling the images, and generating an output pdf.
* The pdfs are taken in lexical order. It chooses the same encoding
* for all pages, which is either tiffg4 or jpeg (DCT). For the latter,
* the jpeg quality factor can be used to trade off the size of the
* resulting pdf against the image quality.
* The pdfs are taken in lexical order. Pages are encoded with either
* tiffg4 or jpeg (DCT), or a mixture of them depending on input parameters
* and page color content. For DCT encoding, the jpeg quality factor
* can be used to trade off the size of the resulting pdf against
* the image quality.
*
* If the pages are monochrome (black and white), use of the %one-bit
* If the pages are monochrome (black and white), use of the %onebit
* flag will achieve better compression with less distortion.
* If most of the pages are black and white, but some have color that
* needs to be saved, input parameters %one_bit and %save_color should
* needs to be saved, input parameters %onebit and %savecolor should
* be both set to 1. Then the pages with color are compressed with DCT
* and the monochrome pages are compressed with tiffg4.
*
* The first step is to render the images as RGB, using Poppler's pdftoppm.
* Compare concatpdf with cleanpdf, which carries out several cleanup
* Compare compresspdf with cleanpdf, which carries out several cleanup
* operations, such as deskewing and adaptive thresholding to clean
* noisy or dark backgrounds in grayscale or color images, which results
* noisy or dark backgrounds in grayscale or color images, resulting
* in high resolution, 1 bpp tiffg4 encoded images in the pdf.
*
* Syntax:
* concatpdf basedir scalefactor one_bit save_color quality title outfile
* compresspdf basedir scalefactor onebit savecolor quality title fileout
*
* The %basedir is a directory where the input pdf files are located.
* The program will operate on every file in this directory with
Expand All @@ -63,12 +64,12 @@
* first image in the set.
*
* Images are saved in the ./image directory as RGB in ppm format.
* If the %one_bit flag is 0, these will be encoded in the output pdf
* If the %onebit flag is 0, these will be encoded in the output pdf
* using DCT. To force the images to be 1 bpp, with tiffg4 encoding, set
* the $one-bit flag to 1.
* the %onebit flag to 1.
*
* The %save_color flag is ignored unless %one_bit is 1. In that case,
* if %save_color is 1, the image is tested for color content, and if
* The %savecolor flag is ignored unless %onebit is 1. In that case,
* if %savecolor is 1, the image is tested for color content, and if
* even a relatively small amount is found, the image will be encoded
* with DCT instead of tiffg4.
*
Expand All @@ -80,7 +81,7 @@
* The %title is the title given to the pdf. Use %title == "none"
* to omit the title.
*
* The pdf output is written to %outfile. It is advisable (but not
* The pdf output is written to %fileout. It is advisable (but not
* required) to have a '.pdf' extension.
*
* The intent is to use pdftoppm to render the images at 150 pixels/inch
Expand Down Expand Up @@ -120,25 +121,22 @@ l_int32 main(int argc,
char **argv)
{
char buf[256];
char *basedir, *fname, *tail, *basename, *imagedir, *title, *outfile;
l_int32 res, render_res, one_bit, save_color, quality, i, n, ret;
l_float32 scalefactor, colorfract;
PIX *pixs, *pix1, *pix2;
PIXA *pixa1 = NULL;
PIXAC *pixac1 = NULL;
char *basedir, *fname, *tail, *basename, *imagedir, *title, *fileout;
l_int32 render_res, onebit, savecolor, quality, i, n, ret;
l_float32 scalefactor;
SARRAY *sa;

if (argc != 8)
return ERROR_INT(
"Syntax: concatpdf basedir scalefactor one-bit save_color quality "
"title outfile", __func__, 1);
"Syntax: compresspdf basedir scalefactor onebit savecolor quality "
"title fileout", __func__, 1);
basedir = argv[1];
scalefactor = atof(argv[2]);
one_bit = atoi(argv[3]); /* set to 1 to enforce 1 bpp tiffg4 encoding */
save_color = atoi(argv[4]); /* if one_bit == 1, set to 1 to save color */
onebit = atoi(argv[3]); /* set to 1 to enforce 1 bpp tiffg4 encoding */
savecolor = atoi(argv[4]); /* if onebit == 1, set to 1 to save color */
quality = atoi(argv[5]); /* jpeg quality */
title = argv[6];
outfile = argv[7];
fileout = argv[7];
setLeptDebugOK(1);
if (quality <= 0) quality = 50; /* default value */
if (quality < 25) {
Expand Down Expand Up @@ -196,75 +194,17 @@ SARRAY *sa;
/* Optionally binarize, then scale and collect all images in memory.
* If n > 100, use pixacomp instead of pixa to store everything
* before generating the pdf.
* If we want to use the one-bit option so that the images are
* encoded with tiffg4, it is necessary to binarize here.
* Do not let 'pdftoppm -mono' do the binarization, because it will
* apply error-diffusion dithering on gray or color images. */
* When using the onebit option, it is important to binarize
* the images in leptonica. Do not let 'pdftoppm -mono' do
* the binarization, because it will apply error-diffusion
* dithering to gray and color images. */
sa = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0);
lept_free(imagedir);
sarrayWriteStderr(sa);
n = sarrayGetCount(sa);
if (n <= 100)
pixa1 = pixaCreate(n);
else
pixac1 = pixacompCreate(n);
for (i = 0; i < n; i++) {
if (i == 0)
lept_stderr("page: ");
else if (i % 10 == 0)
lept_stderr("%d . ", i);
fname = sarrayGetString(sa, i, L_NOCOPY);
pixs = pixRead(fname);
if (one_bit) {
if (save_color) {
pixColorFraction(pixs, 40, 224, 80, 4, NULL, &colorfract);
if (colorfract > 0.01) /* save the color */
pix1 = pixClone(pixs);
else
pix1 = pixConvertTo1(pixs, 180);
} else { /* do not save any color */
pix1 = pixConvertTo1(pixs, 180);
}
} else { /* DCT encoding for all images */
pix1 = pixClone(pixs);
}
if (scalefactor == 1.0)
pix2 = pixClone(pix1);
else
pix2 = pixScale(pix1, scalefactor, scalefactor);
if (n <= 100) {
pixaAddPix(pixa1, pix2, L_INSERT);
} else {
pixacompAddPix(pixac1, pix2, IFF_DEFAULT);
pixDestroy(&pix2);
}
pixDestroy(&pixs);
pixDestroy(&pix1);
}
sarrayDestroy(&sa);
lept_stderr("compressing ...\n");
compressFilesToPdf(sa, onebit, savecolor, scalefactor, quality,
title, fileout);

/* Generate the pdf. Compute the actual input resolution from
* the pixel dimensions of the first image. This will cause each
* page to be printed to cover an 8.5 x 11 inch sheet of paper. */
lept_stderr("\nWrite output to %s\n", outfile);
if (n <= 100)
pix1 = pixaGetPix(pixa1, 0, L_CLONE);
else
pix1 = pixacompGetPix(pixac1, 0);
pixInferResolution(pix1, 11.0, &res);
pixDestroy(&pix1);
if (strcmp(title, "none") == 0)
title = NULL;
if (n <= 100) {
pixaConvertToPdf(pixa1, res, 1.0, L_DEFAULT_ENCODE, quality,
title, outfile);
pixaDestroy(&pixa1);
} else {
pixacompConvertToPdf(pixac1, res, 1.0, L_DEFAULT_ENCODE, quality,
title, outfile);
pixacompDestroy(&pixac1);
}
return 0;
}


6 changes: 3 additions & 3 deletions prog/makefile.static
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ SRC = adaptmap_reg.c adaptnorm_reg.c affine_reg.c \
ccbordtest.c cctest1.c \
cleanpdf.c colorsegtest.c comparepages.c \
comparepixa.c comparetest.c \
concatpdf.c contrasttest.c \
compresspdf.c contrasttest.c \
convertfilestopdf.c convertfilestops.c \
convertformat.c \
convertsegfilestopdf.c convertsegfilestops.c \
Expand Down Expand Up @@ -851,8 +851,8 @@ comparepixa: comparepixa.o $(LEPTLIB)
comparetest: comparetest.o $(LEPTLIB)
$(CC) -o comparetest comparetest.o $(ALL_LIBS) $(EXTRALIBS)

concatpdf: concatpdf.o $(LEPTLIB)
$(CC) -o concatpdf concatpdf.o $(ALL_LIBS) $(EXTRALIBS)
compresspdf: compresspdf.o $(LEPTLIB)
$(CC) -o compresspdf compresspdf.o $(ALL_LIBS) $(EXTRALIBS)

contrasttest: contrasttest.o $(LEPTLIB)
$(CC) -o contrasttest contrasttest.o $(ALL_LIBS) $(EXTRALIBS)
Expand Down
Loading

0 comments on commit 6c15bd5

Please sign in to comment.