From 462126136a76b061fdebf9f6eb6a033e7d60f72b Mon Sep 17 00:00:00 2001 From: danblooomberg Date: Wed, 30 Aug 2023 20:37:09 -0700 Subject: [PATCH] Pull new pixCleanImage() out of cleanTo1bppFilesToPdf() * Also, allow for the input to be 1 bpp. * Test the new function for 1 bpp and grayscale in prog/misctest1 --- prog/cleanpdf.c | 1 + prog/misctest1.c | 24 +++++++++++ src/allheaders.h | 1 + src/pageseg.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++- src/pdfapp.c | 47 +++++++-------------- 5 files changed, 146 insertions(+), 33 deletions(-) diff --git a/prog/cleanpdf.c b/prog/cleanpdf.c index 399568a5f..d55133049 100644 --- a/prog/cleanpdf.c +++ b/prog/cleanpdf.c @@ -263,5 +263,6 @@ SARRAY *sa; lept_stderr("cleaning ...\n"); cleanTo1bppFilesToPdf(sa, res, contrast, rotation, opensize, title, fileout); + sarrayDestroy(&sa); return 0; } diff --git a/prog/misctest1.c b/prog/misctest1.c index 0272f56d6..93ae0d1c6 100644 --- a/prog/misctest1.c +++ b/prog/misctest1.c @@ -41,6 +41,7 @@ * * Display differences in images with pixDisplayDiff() * * Demonstrate read of cmap+alpha png, and I/O of rgba pnm, bmp, webp * * Demonstrate image cropping function + * * Demonstrate image cleaning function */ #ifdef HAVE_CONFIG_H @@ -420,5 +421,28 @@ PIXCMAP *cmap, *cmapg; pixDestroy(&pix1); pixDestroy(&pix2); + /* Page cleaning */ + pixa1 = pixaCreate(3); + pix1 = pixRead("tel_3.tif"); + pix2 = pixRotate(pix1, 0.02, L_ROTATE_SAMPLING, L_BRING_IN_WHITE, 0, 0); + pix3 = pixCleanImage(pix2, 1, 0, 1, 0); + pixaAddPix(pixa1, pix3, L_INSERT); + pixDisplay(pix3, 800, 800); + pixDestroy(&pix1); + pixDestroy(&pix2); + pix1 = pixRead("w91frag.jpg"); + pixaAddPix(pixa1, pixScale(pix1, 2.5, 2.5), L_INSERT); + pix2 = pixRotate(pix1, 0.02, L_ROTATE_AREA_MAP, L_BRING_IN_WHITE, 0, 0); + pix3 = pixCleanImage(pix2, 1, 0, 1, 0); + pixaAddPix(pixa1, pixScale(pix3, 2.5, 2.5), L_INSERT); + pixDisplay(pix3, 1200, 800); + pixDestroy(&pix1); + pixDestroy(&pix2); + pixDestroy(&pix3); + lept_stderr("Writing 
/tmp/lept/misc/pageclean.pdf\n"); + pixaConvertToPdf(pixa1, 0, 1.0, L_DEFAULT_ENCODE, 50, NULL, + "/tmp/lept/misc/pageclean.pdf"); + pixaDestroy(&pixa1); + return 0; } diff --git a/src/allheaders.h b/src/allheaders.h index 10e8bfadd..8320f89f9 100644 --- a/src/allheaders.h +++ b/src/allheaders.h @@ -1376,6 +1376,7 @@ LEPT_DLL extern PIX * pixGenerateHalftoneMask ( PIX *pixs, PIX **ppixtext, l_int LEPT_DLL extern PIX * pixGenTextlineMask ( PIX *pixs, PIX **ppixvws, l_int32 *ptlfound, PIXA *pixadb ); LEPT_DLL extern PIX * pixGenTextblockMask ( PIX *pixs, PIX *pixvws, PIXA *pixadb ); LEPT_DLL extern PIX * pixCropImage ( PIX *pixs, l_int32 lr_clear, l_int32 tb_clear, l_int32 edgeclean, l_int32 lr_add, l_int32 tb_add, const char *debugfile, BOX **pcropbox ); +LEPT_DLL extern PIX * pixCleanImage ( PIX *pixs, l_int32 contrast, l_int32 rotation, l_int32 scale, l_int32 opensize ); LEPT_DLL extern BOX * pixFindPageForeground ( PIX *pixs, l_int32 threshold, l_int32 mindist, l_int32 erasedist, l_int32 showmorph, PIXAC *pixac ); LEPT_DLL extern l_ok pixSplitIntoCharacters ( PIX *pixs, l_int32 minw, l_int32 minh, BOXA **pboxa, PIXA **ppixa, PIX **ppixdebug ); LEPT_DLL extern BOXA * pixSplitComponentWithProfile ( PIX *pixs, l_int32 delta, l_int32 mindel, PIX **ppixdebug ); diff --git a/src/pageseg.c b/src/pageseg.c index 2c4f03a26..92109fefd 100644 --- a/src/pageseg.c +++ b/src/pageseg.c @@ -42,8 +42,9 @@ * Textblock extraction * PIX *pixGenTextblockMask() * - * Location and extraction of page foreground + * Location and extraction of page foreground; cleaning pages * PIX *pixCropImage() + * PIX *pixCleanImage() * BOX *pixFindPageForeground() * * Extraction of characters from image with only text @@ -524,7 +525,7 @@ PIX *pix1, *pix2, *pix3, *pixd; /*------------------------------------------------------------------* - * Location of page foreground * + * Location and extraction of page foreground; cleaning pages * 
*------------------------------------------------------------------*/ /*! * \brief pixCropImage() @@ -689,6 +690,107 @@ BOX *box1, *box2; } +/*! + * \brief pixCleanImage() + * + * \param[in] pixs full resolution (any type or depth) + * \param[in] contrast vary contrast: 1 = lightest; 10 = darkest; + * suggest 1 unless light features are being lost + * \param[in] rotation cw by 90 degrees: {0,1,2,3} represent + * 0, 90, 180 and 270 degree cw rotations + * \param[in] scale 1 (no scaling) or 2 (2x upscaling) + * \param[in] opensize opening size of structuring element for noise + * removal: {0 or 1 to skip; 2, 3 for opening} + * \return cleaned pix, or NULL on error + * + *
+ * Notes:
+ *    (1) This deskews, optionally rotates and darkens, cleans background
+ *        to white, binarizes and optionally removes small noise.
+ *    (2) For color and grayscale input, local background normalization is
+ *        done to 200, and a threshold of 180 sets the maximum foreground
+ *        value in the normalized image.
+ *    (3) The %contrast parameter adjusts the binarization to avoid losing
+ *        lighter input pixels.  Contrast is increased as %contrast increases
+ *        from 1 to 10.
+ *    (4) The %scale parameter controls the thresholding to 1 bpp. Two values:
+ *            1 = threshold
+ *            2 = linear interpolated 2x upscaling before threshold.
+ *    (5) The %opensize parameter is the size of a square SEL used with
+ *        opening to remove small speckle noise.  Allowed open sizes are 2,3.
+ *        If this is to be used, try 2 before 3.
+ *    (6) This does the image processing for cleanTo1bppFilesToPdf() and
+ *        prog/cleanpdf.c.
+ * 
+ */
+PIX *
+pixCleanImage(PIX     *pixs,
+              l_int32  contrast,
+              l_int32  rotation,
+              l_int32  scale,
+              l_int32  opensize)
+{
+char  sequence[32];
+PIX  *pix1, *pix2, *pix3, *pix4, *pix5;
+
+    if (!pixs)
+        return (PIX *)ERROR_PTR("pixs not defined", __func__, NULL);
+    if (rotation < 0 || rotation > 3) {
+        L_ERROR("invalid rotation = %d; rotation must be in {0,1,2,3}\n",
+                __func__, rotation);
+        return NULL;
+    }
+    if (contrast < 1 || contrast > 10) {
+        L_ERROR("invalid contrast = %d; contrast must be in [1...10]\n",
+                __func__, contrast);
+        return NULL;
+    }
+    if (scale != 1 && scale != 2) {
+        L_ERROR("invalid scale = %d; scale must be 1 or 2\n",
+                __func__, scale);
+        return NULL;
+    }
+    if (opensize > 3) {
+        L_ERROR("invalid opensize = %d; opensize must be <= 3\n",
+                __func__, opensize);
+        return NULL;
+    }
+
+    if (pixGetDepth(pixs) == 1) {  /* binary input: no normalization */
+        if (rotation > 0)
+            pix1 = pixRotateOrth(pixs, rotation);
+        else
+            pix1 = pixClone(pixs);
+        pix2 = pixFindSkewAndDeskew(pix1, 2, NULL, NULL);
+        if (scale == 2)
+            pix4 = pixExpandBinaryReplicate(pix2, 2, 2);
+        else  /* scale == 1 */
+            pix4 = pixClone(pix2);
+    } else {  /* gray or color: normalize background and threshold */
+        pix1 = pixConvertTo8MinMax(pixs);
+        if (rotation > 0)
+            pix2 = pixRotateOrth(pix1, rotation);
+        else
+            pix2 = pixClone(pix1);
+        pix3 = pixFindSkewAndDeskew(pix2, 2, NULL, NULL);
+        pix4 = pixBackgroundNormTo1MinMax(pix3, contrast, scale);
+        pixDestroy(&pix3);
+    }
+
+    if (opensize == 2 || opensize == 3) {
+        snprintf(sequence, sizeof(sequence), "o%d.%d", opensize, opensize);
+        pix5 = pixMorphSequence(pix4, sequence, 0);
+    } else {
+        pix5 = pixClone(pix4);  /* opensize < 2: skip noise removal */
+    }
+
+    pixDestroy(&pix1);
+    pixDestroy(&pix2);
+    pixDestroy(&pix4);
+    return pix5;
+}
+
+
 /*!
* \brief pixFindPageForeground() * diff --git a/src/pdfapp.c b/src/pdfapp.c index 3806869ce..4f35b3be5 100644 --- a/src/pdfapp.c +++ b/src/pdfapp.c @@ -324,7 +324,7 @@ PIXAC *pixac1 = NULL; * \param[in] rotation cw by 90 degrees: {0,1,2,3} represent * 0, 90, 180 and 270 degree cw rotations * \param[in] opensize opening size of structuring element for noise - * removal: {0 to skip; 2, 3} + * removal: {0 or 1 to skip; 2, 3 for opening} * \param[in] title [optional] pdf title; can be null * \param[in] fileout pdf file of all images * \return 0 if OK, 1 on error @@ -334,21 +334,20 @@ PIXAC *pixac1 = NULL; * (1) This deskews, optionally rotates and darkens, cleans background * to white, binarizes and optionally removes small noise, and * put the images into the pdf in the order given in %sa. - * (2) It does the image processing for prog/cleanpdf.c. - * (3) All images in the pdf are tiffg4 encoded. - * (4) For color and grayscale input, local background normalization is + * (2) All images in the pdf are tiffg4 encoded. + * (3) For color and grayscale input, local background normalization is * done to 200, and a threshold of 180 sets the maximum foreground * value in the normalized image. - * (5) The %res parameter can be either 300 or 600 ppi. If the input + * (4) The %res parameter can be either 300 or 600 ppi. If the input * is gray or color and %res = 600, this does an interpolated 2x * expansion before binarizing. - * (6) The %contrast parameter adjusts the binarization to avoid losing + * (5) The %contrast parameter adjusts the binarization to avoid losing * lighter input pixels. Contrast is increased as %contrast increases * from 1 to 10. - * (7) The #opensize parameter is the size of a square SEL used with + * (6) The %opensize parameter is the size of a square SEL used with * opening to remove small speckle noise. Allowed open sizes are 2,3. * If this is to be used, try 2 before 3. 
- * (8) If there are more than 200 images, store the images after processing + * (7) If there are more than 200 images, store the images after processing * as an array of compressed images (a Pixac); otherwise, use a Pixa. * */ @@ -361,11 +360,10 @@ cleanTo1bppFilesToPdf(SARRAY *sa, const char *title, const char *fileout) { -char sequence[32]; char *fname; l_int32 n, i, scale; l_int32 maxsmallset = 200; /* max num images kept uncompressed in array */ -PIX *pixs, *pix1, *pix2, *pix3, *pix4, *pix5; +PIX *pixs, *pix1; PIXA *pixa1 = NULL; PIXAC *pixac1 = NULL; @@ -408,31 +406,19 @@ PIXAC *pixac1 = NULL; else if (i % 10 == 0) lept_stderr("%d . ", i); fname = sarrayGetString(sa, i, L_NOCOPY); - pixs = pixRead(fname); - pix1 = pixConvertTo8MinMax(pixs); - if (rotation > 0) - pix2 = pixRotateOrth(pix1, rotation); - else - pix2 = pixClone(pix1); - pix3 = pixFindSkewAndDeskew(pix2, 2, NULL, NULL); - pix4 = pixBackgroundNormTo1MinMax(pix3, contrast, scale); - if (opensize == 2 || opensize == 3) { - snprintf(sequence, sizeof(sequence), "o%d.%d", opensize, opensize); - pix5 = pixMorphSequence(pix4, sequence, 0); - } else { - pix5 = pixClone(pix4); + if ((pixs = pixRead(fname)) == NULL) { + L_ERROR("pixs not read from %s\n", __func__, fname); + continue; } + + pix1 = pixCleanImage(pixs, contrast, rotation, scale, opensize); if (n <= maxsmallset) { - pixaAddPix(pixa1, pix5, L_INSERT); + pixaAddPix(pixa1, pix1, L_INSERT); } else { - pixacompAddPix(pixac1, pix5, IFF_TIFF_G4); - pixDestroy(&pix5); + pixacompAddPix(pixac1, pix1, IFF_TIFF_G4); + pixDestroy(&pix1); } pixDestroy(&pixs); - pixDestroy(&pix1); - pixDestroy(&pix2); - pixDestroy(&pix3); - pixDestroy(&pix4); } /* Generate the pdf. Compute the actual input resolution from @@ -455,7 +441,6 @@ PIXAC *pixac1 = NULL; pixacompConvertToPdf(pixac1, res, 1.0, L_G4_ENCODE, 0, title, fileout); pixacompDestroy(&pixac1); } - return 0; }