From 356ed00377c24e2cec1055be309ceaf11b22203c Mon Sep 17 00:00:00 2001 From: Plamen Dimitrov Date: Tue, 10 Jan 2023 13:43:42 +0200 Subject: [PATCH] Provide recursive region rescanning to detect more text locations The default Tesseract operation might recognize entire windows or rectangular regions as text (a word or a character), so let's recurse into regions that are larger than a configurable portion of the screen, as these are plausibly not a character, a word, or a line. If Tesseract OCR gets confused by large rectangular regions of more or less uniform background, this could also allow us to improve text detection within each such region without any additional binarization or other text detection pre-processing. --- guibot/finder.py | 65 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/guibot/finder.py b/guibot/finder.py index 5552fae4..411f6e1c 100644 --- a/guibot/finder.py +++ b/guibot/finder.py @@ -1868,6 +1868,8 @@ def __configure_backend(self, backend=None, category="text", reset=False): # 13 different page segmentation modes - see Tesseract API self.params[category]["psmode"] = CVParameter(3, 0, 13, enumerated=True) self.params[category]["extra_configs"] = CVParameter("") + self.params[category]["recursion_height"] = CVParameter(0.3, 0.0, 1.0, 0.01) + self.params[category]["recursion_width"] = CVParameter(0.3, 0.0, 1.0, 0.01) elif backend == "east": # network input dimensions - must be divisible by 32, however currently only # 320x320 doesn't error out from the OpenCV implementation @@ -2272,30 +2274,49 @@ def binarize_step(threshold, text_img): def _detect_text_boxes(self, haystack): import cv2 import numpy - img_haystack = numpy.array(haystack.pil_image) - img_haystack = numpy.array(haystack.pil_image) - self.imglog.hotmaps.append(img_haystack) - - output = self.tbox.run_and_get_output(img_haystack, 'box', - self.params["tdetect"]["language"].value, - config=self.tbox_config) + 
char_canvas = numpy.array(haystack.pil_image) + text_canvas = numpy.array(haystack.pil_image) + self.imglog.hotmaps.append(char_canvas) + self.imglog.hotmaps.append(text_canvas) text_regions = [] - for line in output.splitlines(): - tokens = line.rstrip().split(" ", maxsplit=6) - if tokens[0] != "WordStr": - continue - left = int(tokens[1]) - bottom = haystack.height - int(tokens[2]) - right = int(tokens[3]) - top = haystack.height - int(tokens[4]) - text = tokens[6][1:] - - x, y, w, h = left, top, right - left, bottom - top - logging.debug("Found text '%s' with tesseract-provided box %s", text, (x, y, w, h)) - cv2.rectangle(img_haystack, (x, y), (x+w, y+h), (0, 0, 0), 2) - cv2.rectangle(img_haystack, (x, y), (x+w, y+h), (0, 255, 0), 1) - text_regions.append([x, y, w, h]) + recursive_regions = [(0, 0, numpy.array(haystack.pil_image))] + while len(recursive_regions) > 0: + offset_x, offset_y, next_region = recursive_regions.pop() + region_w, region_h = next_region.shape[1], next_region.shape[0] + + output = self.tbox.run_and_get_output(next_region, 'box', + self.params["tdetect"]["language"].value, + config=self.tbox_config) + for line in output.splitlines(): + tokens = line.rstrip().split(" ", maxsplit=6) + if tokens[0] != "WordStr": + continue + left = int(tokens[1]) + bottom = region_h - int(tokens[2]) + right = int(tokens[3]) + top = region_h - int(tokens[4]) + text = tokens[6][1:] + + dx, dy, w, h = left, top, right - left, bottom - top + x, y = offset_x + dx, offset_y + dy + if text == "": + logging.debug("Empty text found, skipping region") + continue + recursion_width = self.params["tdetect"]["recursion_width"].value * haystack.width + recursion_height = self.params["tdetect"]["recursion_height"].value * haystack.height + if (w > recursion_width and h > 0) or (h > recursion_height and w > 0): + subregion_npy = next_region[max(dy, 0):min(dy+h, region_h), + max(dx, 0):min(dx+w, region_w)] + if next_region.shape != subregion_npy.shape: + logging.debug("Large 
region of size %sx%s detected, rescanning inside of it", w, h) + recursive_regions.append((x, y, subregion_npy)) + continue + + logging.debug("Found text '%s' with tesseract-provided box %s", text, (x, y, w, h)) + cv2.rectangle(text_canvas, (x, y), (x+w, y+h), (0, 0, 0), 2) + cv2.rectangle(text_canvas, (x, y), (x+w, y+h), (0, 255, 0), 1) + text_regions.append([x, y, w, h]) return text_regions