Skip to content

Commit

Permalink
Added option to pixCropImage() to extract a page embedded in black image
Browse files Browse the repository at this point in the history
* When there is an oversized media box, the pdftoppm renderer can get
  confused about the image size and generate a larger image that
  is embedded in a black background.
* Added a test to misctest2.c for this functionality.
  • Loading branch information
DanBloomberg committed Oct 8, 2024
1 parent a60bd16 commit c35a72e
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 8 deletions.
Binary file added prog/bad_mediabox_input.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions prog/croppdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
*
* The %edgeclean parameter is used to remove noise that is typically
* near the edges of the image:
* -2: to extract page embedded in black background
* -1: aggressively removes left and right side noise
* 0: default, no removal
* 1-15: removal of random noise, where 15 is maximally aggressive
Expand Down
24 changes: 24 additions & 0 deletions prog/misctest2.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
* * Demonstrate image cleaning function
* * Demonstrate page cropping for 2-column, where one column is
* Kanji, and removing lots of junk on left and right sides.
* * Demonstrate page cropping with edgeclean = -2, for a situation
* where a bad oversized mediabox confuses the pdftoppm renderer,
* which embeds the page image in a larger black image.
*/

#ifdef HAVE_CONFIG_H
Expand Down Expand Up @@ -83,6 +86,20 @@ PIXA *pixa1;
"croppdf /tmp/lept/2_column 50 50 -1 70 70 1.12 0"
" none /tmp/lept/misc/2_column_crop_result.pdf");
lept_stderr("Writing /tmp/lept/misc/2_column_crop_result.pdf\n");
callSystemDebug(buf);

/* Page cropping for oversize media box that causes the renderer
* to embed the page in a larger black image. So we need to
* extract the actual page. This is now done with croppdf, using
* edgeclean = -2. The bad scan was encoded with jbig2. It looks
* OK when rendering with evince, but pdftoppm is tripped up by the
* mediabox. See the rendered images at the end of this file. */
lept_mkdir("lept/bad_mediabox");
lept_cp("bad_mediabox_input.pdf", "lept/bad_mediabox", "input.pdf", NULL);
snprintf(buf, sizeof(buf),
"croppdf /tmp/lept/bad_mediabox 50 50 -2 80 80 1.12 0"
" none /tmp/lept/misc/bad_mediabox_crop_result.pdf");
lept_stderr("Writing /tmp/lept/misc/bad_mediabox_crop_result.pdf\n");
callSystemDebug(buf);

/* Page cleaning */
Expand All @@ -108,5 +125,12 @@ PIXA *pixa1;
"/tmp/lept/misc/pageclean.pdf");
pixaDestroy(&pixa1);

/* Input images to bad mediabox example pages; delayed from
* above to give system a chance to generate them. */
snprintf(buf, sizeof(buf), "displaypix /tmp/lept/renderpdf/input-1.ppm");
callSystemDebug(buf);
snprintf(buf, sizeof(buf), "displaypix /tmp/lept/renderpdf/input-2.ppm");
callSystemDebug(buf);

return 0;
}
3 changes: 1 addition & 2 deletions src/allheaders.h
Original file line number Diff line number Diff line change
Expand Up @@ -1377,7 +1377,6 @@ LEPT_DLL extern PIX * pixGenerateHalftoneMask ( PIX *pixs, PIX **ppixtext, l_int
LEPT_DLL extern PIX * pixGenTextlineMask ( PIX *pixs, PIX **ppixvws, l_int32 *ptlfound, PIXA *pixadb );
LEPT_DLL extern PIX * pixGenTextblockMask ( PIX *pixs, PIX *pixvws, PIXA *pixadb );
LEPT_DLL extern PIX * pixCropImage ( PIX *pixs, l_int32 lr_clear, l_int32 tb_clear, l_int32 edgeclean, l_int32 lr_border, l_int32 tb_border, l_float32 maxwiden, l_int32 printwiden, const char *debugfile, BOX **pcropbox );
LEPT_DLL extern l_int32 pixMaxCompAfterVClosing ( PIX *pixs, BOX **pbox );
LEPT_DLL extern PIX * pixCleanImage ( PIX *pixs, l_int32 contrast, l_int32 rotation, l_int32 scale, l_int32 opensize );
LEPT_DLL extern BOX * pixFindPageForeground ( PIX *pixs, l_int32 threshold, l_int32 mindist, l_int32 erasedist, l_int32 showmorph, PIXAC *pixac );
LEPT_DLL extern l_ok pixSplitIntoCharacters ( PIX *pixs, l_int32 minw, l_int32 minh, BOXA **pboxa, PIXA **ppixa, PIX **ppixdebug );
Expand Down Expand Up @@ -1407,7 +1406,7 @@ LEPT_DLL extern l_ok partifyPixac ( PIXAC *pixac, l_int32 nparts, const char *ou
LEPT_DLL extern BOXA * boxaGetWhiteblocks ( BOXA *boxas, BOX *box, l_int32 sortflag, l_int32 maxboxes, l_float32 maxoverlap, l_int32 maxperim, l_float32 fract, l_int32 maxpops );
LEPT_DLL extern BOXA * boxaPruneSortedOnOverlap ( BOXA *boxas, l_float32 maxoverlap );
LEPT_DLL extern l_ok compressFilesToPdf ( SARRAY *sa, l_int32 onebit, l_int32 savecolor, l_float32 scalefactor, l_int32 quality, const char *title, const char *fileout );
LEPT_DLL extern l_ok cropFilesToPdf ( SARRAY *sa, l_int32 lr_clear, l_int32 tb_clear, l_int32 edgeclean, l_int32 lr_add, l_int32 tb_add, l_float32 maxwiden, l_int32 printwiden, const char *title, const char *fileout );
LEPT_DLL extern l_ok cropFilesToPdf ( SARRAY *sa, l_int32 lr_clear, l_int32 tb_clear, l_int32 edgeclean, l_int32 lr_border, l_int32 tb_border, l_float32 maxwiden, l_int32 printwiden, const char *title, const char *fileout );
LEPT_DLL extern l_ok cleanTo1bppFilesToPdf ( SARRAY *sa, l_int32 res, l_int32 contrast, l_int32 rotation, l_int32 opensize, const char *title, const char *fileout );
LEPT_DLL extern l_ok convertFilesToPdf ( const char *dirname, const char *substr, l_int32 res, l_float32 scalefactor, l_int32 type, l_int32 quality, const char *title, const char *fileout );
LEPT_DLL extern l_ok saConvertFilesToPdf ( SARRAY *sa, l_int32 res, l_float32 scalefactor, l_int32 type, l_int32 quality, const char *title, const char *fileout );
Expand Down
81 changes: 76 additions & 5 deletions src/pageseg.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
* Location and extraction of page foreground; cleaning pages
* PIX *pixCropImage()
* static l_int32 pixMaxCompAfterVClosing()
* static l_int32 pixFindPageInsideBlackBorder()
* static PIX *pixRescaleForCropping()
* PIX *pixCleanImage()
* BOX *pixFindPageForeground()
Expand Down Expand Up @@ -94,6 +95,8 @@
static const l_int32 MinWidth = 100;
static const l_int32 MinHeight = 100;

static l_ok pixMaxCompAfterVClosing(PIX *pixs, BOX **pbox);
static l_ok pixFindPageInsideBlackBorder(PIX *pixs, BOX **pbox);
static PIX *pixRescaleForCropping(PIX *pixs, l_int32 w, l_int32 h,
l_int32 lr_border, l_int32 tb_border,
l_float32 maxwiden, PIX **ppixsc);
Expand Down Expand Up @@ -542,6 +545,7 @@ PIX *pix1, *pix2, *pix3, *pixd;
* default = 0 (no removal);
* 15 is maximally aggressive for random noise
* -1 for aggressively removing side noise
* -2 to extract page embedded in black background
* \param[in] lr_border full res final "added" pixels on left and right
* \param[in] tb_border full res final "added" pixels on top and bottom
* \param[in] maxwiden max fractional horizontal stretch allowed
Expand All @@ -558,11 +562,13 @@ PIX *pix1, *pix2, *pix3, *pixd;
* resolution pixels. (This is done at 2x reduction.)
* (c) If %edgeclean > 0, it removes isolated sets of pixels,
* using a close/open operation of size %edgeclean + 1.
* If %edgeclean < 0, it uses a large vertical morphological
* If %edgeclean == -1, it uses a large vertical morphological
* close/open and the extraction of either the largest
* resulting connected component (or the largest two components
* if the page has 2 columns), to eliminate noise on left
* and right sides.
* If %edgeclean == -2, it extracts the page region from a
* possible exterior black surround.
* (d) Find the bounding box of remaining fg pixels and scales
* the box up 2x back to full resolution.
* (e) Crops the binarized image to the bounding box.
Expand Down Expand Up @@ -621,6 +627,10 @@ PIXA *pixa1;
L_WARNING("edgeclean > 15; setting to 15\n", __func__);
edgeclean = 15;
}
if (edgeclean < -1) {
lept_stderr("Using edgeclean = -2\n");
edgeclean = -2;
}
pixGetDimensions(pixs, &w, &h, NULL);
if (w < MinWidth || h < MinHeight) {
L_ERROR("pix too small: w = %d, h = %d\n", __func__, w, h);
Expand Down Expand Up @@ -665,8 +675,10 @@ PIXA *pixa1;
pix3 = pixMorphSequence(pix2, cmd, 0);
ret = pixClipToForeground(pix3, NULL, &box1);
pixDestroy(&pix3);
} else { /* edgeclean < 0) */
} else if (edgeclean == -1) {
ret = pixMaxCompAfterVClosing(pix2, &box1);
} else { /* edgeclean == -2 */
ret = pixFindPageInsideBlackBorder(pix2, &box1);
}
pixDestroy(&pix2);
if (ret) {
Expand Down Expand Up @@ -757,10 +769,10 @@ PIXA *pixa1;
* (4) To work properly with 2-column layout, if the largest and
* second-largest regions are comparable in size, both are included.
* (5) This is used as an option to pixCropImage(), when given
* a negative %edgecrop parameter.
* an %edgeclean parameter of -1.
* </pre>
*/
l_int32
static l_ok
pixMaxCompAfterVClosing(PIX *pixs,
BOX **pbox)
{
Expand All @@ -778,8 +790,10 @@ PIX *pix1;
/* Strong vertical closing */
pix1 = pixMorphSequence(pixs, "r11 + c3.80 + o3.80 + x4", 0);
pixZero(pix1, &empty);
if (empty)
if (empty) {
pixDestroy(&pix1);
return ERROR_INT("pix1 is empty", __func__, 1);
}

/* Find the two c.c. with largest area. If they are not comparable
* in area, return the bounding box of the largest; otherwise,
Expand Down Expand Up @@ -808,6 +822,63 @@ PIX *pix1;
}


/*!
 * \brief   pixFindPageInsideBlackBorder()
 *
 * \param[in]    pixs    1 bpp (input at 2x reduction)
 * \param[out]   pbox    page region at input resolution (2x reduction);
 *                       set to NULL on error
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This extracts the page region from the image.  It is designed
 *          to work when the page is within a fairly solid black border.
 *      (2) It returns a bounding box for the page region at the input res.
 *      (3) The input %pixs is expected to be at a resolution 100 - 150 ppi.
 *      (4) This is used as an option to pixCropImage(), when given an
 *          %edgeclean parameter of -2.
 *      (5) An error is returned, with *pbox left NULL, if any intermediate
 *          image or box cannot be made, or if no candidate page region
 *          is found.  A return value of 0 guarantees that *pbox is
 *          not NULL.
 * </pre>
 */
static l_ok
pixFindPageInsideBlackBorder(PIX   *pixs,
                             BOX  **pbox)
{
l_int32  empty;
BOX     *box1;
BOXA    *boxa1, *boxa2;
PIX     *pix1, *pix2;

    if (!pbox)
        return ERROR_INT("pbox not defined", __func__, 1);
    *pbox = NULL;
    if (!pixs || pixGetDepth(pixs) != 1)
        return ERROR_INT("pixs undefined or not 1 bpp", __func__, 1);

        /* Reduce 4x and remove some remaining small foreground */
    pix1 = pixMorphSequence(pixs, "r22 + c5.5 + o7.7", 0);
    if (!pix1)
        return ERROR_INT("pix1 not made", __func__, 1);
    pixZero(pix1, &empty);
    if (empty) {
        pixDestroy(&pix1);
        return ERROR_INT("pix1 is empty", __func__, 1);
    }

        /* Photoinvert the image, so that the page (originally mostly
         * background) becomes foreground, and consolidate it with
         * a closing followed by an opening. */
    pixInvert(pix1, pix1);
    pix2 = pixMorphSequence(pix1, "c11.11 + o11.11", 0);
    pixDestroy(&pix1);
    if (!pix2)
        return ERROR_INT("pix2 not made", __func__, 1);

        /* Find the c.c. with largest area */
    boxa1 = pixConnCompBB(pix2, 8);
    pixDestroy(&pix2);
    if (!boxa1)
        return ERROR_INT("boxa1 not made", __func__, 1);
    boxa2 = boxaSort(boxa1, L_SORT_BY_AREA, L_SORT_DECREASING, NULL);
    boxaDestroy(&boxa1);
    if (!boxa2)
        return ERROR_INT("boxa2 not made", __func__, 1);
    box1 = boxaGetBox(boxa2, 0, L_COPY);  /* largest by area */
    boxaDestroy(&boxa2);
    if (!box1)  /* no components survived the morphology */
        return ERROR_INT("no page region found", __func__, 1);

        /* Move each side inward by 5 pixels (at the 4x reduced scale),
         * then scale up by 4x to undo the "r22" reduction, giving a
         * box at the resolution of %pixs. */
    boxAdjustSides(box1, box1, 5, -5, 5, -5);
    *pbox = boxTransform(box1, 0, 0, 4.0, 4.0);
    boxDestroy(&box1);
    if (!*pbox)
        return ERROR_INT("pbox not made", __func__, 1);
    return 0;
}


/*!
* \brief pixRescaleForCropping()
*
Expand Down
7 changes: 6 additions & 1 deletion src/pdfapp.c
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ PIXAC *pixac1 = NULL;
* default = 0 (no removal);
* 15 is maximally aggressive for random noise
* -1 for aggressively removing side noise
* -2 to extract page embedded in black background
* \param[in] lr_border full res final "added" pixels on left and right
* \param[in] tb_border full res final "added" pixels on top and bottom
* \param[in] maxwiden max fractional horizontal stretch allowed
Expand Down Expand Up @@ -305,11 +306,15 @@ PIXAC *pixac1 = NULL;
pix1 = pixCropImage(pixs, lr_clear, tb_clear, edgeclean,
lr_border, tb_border, maxwiden, printwiden,
NULL, NULL);
pixDestroy(&pixs);
if (!pix1) {
L_ERROR("pix1 not made for i = %d\n", __func__, i);
continue;
}
if (n <= maxsmallset)
pixaAddPix(pixa1, pix1, L_INSERT);
else
pixacompAddPix(pixac1, pix1, IFF_TIFF_G4);
pixDestroy(&pixs);
}

/* Generate the pdf. Compute the actual input resolution from
Expand Down
1 change: 1 addition & 0 deletions version-notes.html
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ <h2 align=center> <IMG SRC="moller52.jpg" border=1 ALIGN_MIDDLE> </h2>
* Added misctest2.c to show crop and clean page functions.
* Added page crop option for removing noise on left and right sides.
* Added page crop option to allow printing to full width of paper.
* Added page crop option to extract page within larger black image.
* Added renderpdf.c to library, to render pdfs as page images with
a specified resolution.
* Modified cleanpdf.c to use l_pdfRenderFiles().
Expand Down

0 comments on commit c35a72e

Please sign in to comment.