Provide recursive region rescanning to detect more text locations
The default Tesseract operation might recognize entire windows or
rectangular regions as text (a word or character), so let's recurse
into regions that are larger than a configurable portion of the
screen, as these are plausibly not a character, a word, or a line.

If Tesseract OCR gets confused by large rectangular regions of a
more or less uniform background, this could also allow us to improve
text detection within each such region without any additional
binarization or other text-detection pre-processing.
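
As an illustration, here is a minimal tuning sketch; only the two
recursion parameters below are introduced by this change, while the
finder setup around them is assumed (the "tdetect" backend must be
the tesseract box detector extended here). With both defaults at 0.3,
a box reported on a 1920x1080 haystack is rescanned instead of being
accepted as text whenever it is wider than 576 or taller than 324 pixels.

    from guibot.finder import TextFinder

    # assumed setup: a TextFinder whose text detection ("tdetect") backend
    # is the tesseract box detector that this change extends
    finder = TextFinder()

    # rescan any detected box wider than 50% or taller than 40% of the haystack
    finder.params["tdetect"]["recursion_width"].value = 0.5
    finder.params["tdetect"]["recursion_height"].value = 0.4
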
pevogam committed Apr 29, 2024
1 parent f9981fc commit 356ed00
Showing 1 changed file with 43 additions and 22 deletions.
65 changes: 43 additions & 22 deletions guibot/finder.py
@@ -1868,6 +1868,8 @@ def __configure_backend(self, backend=None, category="text", reset=False):
            # 13 different page segmentation modes - see Tesseract API
            self.params[category]["psmode"] = CVParameter(3, 0, 13, enumerated=True)
            self.params[category]["extra_configs"] = CVParameter("")
            self.params[category]["recursion_height"] = CVParameter(0.3, 0.0, 1.0, 0.01)
            self.params[category]["recursion_width"] = CVParameter(0.3, 0.0, 1.0, 0.01)
        elif backend == "east":
            # network input dimensions - must be divisible by 32, however currently only
            # 320x320 doesn't error out from the OpenCV implementation
@@ -2272,30 +2274,49 @@ def binarize_step(threshold, text_img):
    def _detect_text_boxes(self, haystack):
        import cv2
        import numpy
        img_haystack = numpy.array(haystack.pil_image)
        self.imglog.hotmaps.append(img_haystack)

        output = self.tbox.run_and_get_output(img_haystack, 'box',
                                              self.params["tdetect"]["language"].value,
                                              config=self.tbox_config)
        char_canvas = numpy.array(haystack.pil_image)
        text_canvas = numpy.array(haystack.pil_image)
        self.imglog.hotmaps.append(char_canvas)
        self.imglog.hotmaps.append(text_canvas)

        text_regions = []
        for line in output.splitlines():
            tokens = line.rstrip().split(" ", maxsplit=6)
            if tokens[0] != "WordStr":
                continue
            left = int(tokens[1])
            bottom = haystack.height - int(tokens[2])
            right = int(tokens[3])
            top = haystack.height - int(tokens[4])
            text = tokens[6][1:]

            x, y, w, h = left, top, right - left, bottom - top
            logging.debug("Found text '%s' with tesseract-provided box %s", text, (x, y, w, h))
            cv2.rectangle(img_haystack, (x, y), (x+w, y+h), (0, 0, 0), 2)
            cv2.rectangle(img_haystack, (x, y), (x+w, y+h), (0, 255, 0), 1)
            text_regions.append([x, y, w, h])
        recursive_regions = [(0, 0, numpy.array(haystack.pil_image))]
        while len(recursive_regions) > 0:
            offset_x, offset_y, next_region = recursive_regions.pop()
            region_w, region_h = next_region.shape[1], next_region.shape[0]

            output = self.tbox.run_and_get_output(next_region, 'box',
                                                  self.params["tdetect"]["language"].value,
                                                  config=self.tbox_config)
            for line in output.splitlines():
                tokens = line.rstrip().split(" ", maxsplit=6)
                if tokens[0] != "WordStr":
                    continue
                left = int(tokens[1])
                bottom = region_h - int(tokens[2])
                right = int(tokens[3])
                top = region_h - int(tokens[4])
                text = tokens[6][1:]

                dx, dy, w, h = left, top, right - left, bottom - top
                x, y = offset_x + dx, offset_y + dy
                if text == "":
                    logging.debug("Empty text found, skipping region")
                    continue
                recursion_width = self.params["tdetect"]["recursion_width"].value * haystack.width
                recursion_height = self.params["tdetect"]["recursion_height"].value * haystack.height
                if (w > recursion_width and h > 0) or (h > recursion_height and w > 0):
                    subregion_npy = next_region[max(dy, 0):min(dy+h, region_h),
                                                max(dx, 0):min(dx+w, region_w)]
                    if next_region.shape != subregion_npy.shape:
                        logging.debug("Large region of size %sx%s detected, rescanning inside of it", w, h)
                        recursive_regions.append((x, y, subregion_npy))
                        continue

                logging.debug("Found text '%s' with tesseract-provided box %s", text, (x, y, w, h))
                cv2.rectangle(text_canvas, (x, y), (x+w, y+h), (0, 0, 0), 2)
                cv2.rectangle(text_canvas, (x, y), (x+w, y+h), (0, 255, 0), 1)
                text_regions.append([x, y, w, h])

        return text_regions

