diff --git a/.travis.yml b/.travis.yml index 0f01cf15..79877d58 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: - 3.8 - pypy3.5 env: -- INSTALL_VARIANT=pip DISABLE_OCR=1 DISABLE_PYTORCH=1 +- INSTALL_VARIANT=pip DISABLE_PYTORCH=1 matrix: include: - python: 3.7 @@ -33,6 +33,10 @@ addons: sources: - ubuntu-toolchain-r-test packages: + # ocr + - pkg-config + - tesseract-ocr + - libtesseract-dev # virtual screen - libx11-dev - libxtst-dev @@ -51,6 +55,7 @@ before_script: - "/sbin/start-stop-daemon --start --quiet --pidfile /tmp/custom_xvfb_99.pid --make-pidfile --background --exec /usr/bin/Xvfb -- :99 -ac -screen 0 1024x768x24" - sleep 3 # give xvfb some time to start +- if [[ $TRAVIS_PYTHON_VERSION == '3.5' ]]; then export DISABLE_OCR=1; fi - if [[ $TRAVIS_PYTHON_VERSION == 'pypy3.5' ]]; then export DISABLE_AUTOPY=1; fi - if [[ $TRAVIS_PYTHON_VERSION == 'pypy3.5' ]]; then export DISABLE_PYQT=1; fi - if [[ $TRAVIS_PYTHON_VERSION == 'pypy3.5' ]]; then export DISABLE_OPENCV=1; fi diff --git a/guibot/calibrator.py b/guibot/calibrator.py index 2482913c..32154dce 100644 --- a/guibot/calibrator.py +++ b/guibot/calibrator.py @@ -26,6 +26,12 @@ log = logging.getLogger('guibot.calibrator') +#: explicit blacklist of backend combinations to skip for benchmarking +benchmark_blacklist = [("mixed", "normal", "mixed", "east", "hmm", "adaptive", "adaptive"), + ("mixed", "adaptive", "mixed", "east", "hmm", "adaptive", "adaptive"), + ("mixed", "canny", "mixed", "east", "hmm", "adaptive", "adaptive")] + + class Calibrator(object): """ Provides with a group of methods to facilitate and automate the selection @@ -112,12 +118,19 @@ def backend_tuples(category_list, finder): for z in backend_tuples(category_list[1:], finder): yield (backend,) + z for backend_tuple in backend_tuples(ordered_categories, finder): + if backend_tuple in benchmark_blacklist: + log.warning("Skipping blacklisted benchmarked backend combination") + continue method = "+".join(backend_tuple) 
log.info("Benchmark testing with %s", method) for backend, category in zip(backend_tuple, ordered_categories): finder.configure_backend(backend=backend, category=category, reset=False) finder.can_calibrate(category, calibration) + try: + finder.synchronize_backend(backend=backend, category=category, reset=False) + except UnsupportedBackendError as error: + log.debug("Skipping synchronization for %s/backend=%s", category, backend) if random_starts > 0: self.search(finder, random_starts=random_starts, uniform=uniform, @@ -489,6 +502,10 @@ def _handle_restricted_values(self, finder): params["blurKernelSize"].value += 1 if params["backend"] == "adaptive" and params["blockSize"].value % 2 == 0: params["blockSize"].value += 1 + if "tdetect" in finder.params: + params = finder.params["tdetect"] + if params["backend"] == "east" and params["input_res_x"].value != params["input_res_y"].value: + params["input_res_x"].value = params["input_res_y"].value if "ocr" in finder.params: params = finder.params["ocr"] if params["dt_mask_size"].value not in [0, 3, 5]: diff --git a/guibot/config.py b/guibot/config.py index 57e0ffb8..93332985 100644 --- a/guibot/config.py +++ b/guibot/config.py @@ -59,8 +59,8 @@ class GlobalConfig(type): _feature_detect_backend = "ORB" _feature_extract_backend = "ORB" _feature_match_backend = "BruteForce-Hamming" - _text_detect_backend = "erstat" - _text_ocr_backend = "tesseract" + _text_detect_backend = "contours" + _text_ocr_backend = "pytesseract" _hybrid_match_backend = "template" def toggle_delay(self, value=None): @@ -348,8 +348,8 @@ def find_backend(self, value=None): * feature - matching using a mixture of feature detection, extraction and matching algorithms * cascade - matching using OpenCV pretrained Haar cascades - * text - text matching using ERStat or custom text detection, - followed by tesseract or Hidden Markov Model OCR + * text - text matching using EAST, ERStat, or custom text detection, + followed by Tesseract or Hidden Markov Model 
OCR * tempfeat - a mixture of template and feature matching where the first is used as necessary and the second as sufficient stage * deep - deep learning matching using convolutional neural network but @@ -450,7 +450,7 @@ def text_detect_backend(self, value=None): :param value: name of the text detection backend - Supported backends: erstat, contours, components. + Supported backends: east, erstat, contours, components. """ if value is None: return GlobalConfig._text_detect_backend @@ -465,7 +465,7 @@ def text_ocr_backend(self, value=None): :param value: name of the optical character recognition backend - Supported backends: tesseract, hmm, beamSearch. + Supported backends: pytesseract, tesserocr, tesseract (OpenCV), hmm, beamSearch. """ if value is None: return GlobalConfig._text_ocr_backend diff --git a/guibot/finder.py b/guibot/finder.py index 7a26139b..6973a88f 100644 --- a/guibot/finder.py +++ b/guibot/finder.py @@ -19,6 +19,7 @@ import copy import random import configparser as config +import PIL.Image from .config import GlobalConfig, LocalConfig from .imagelogger import ImageLogger @@ -368,6 +369,8 @@ def can_calibrate(self, category, mark): value.fixed = True elif category == "fdetect" and key == "Extended": value.fixed = True + elif category == "tdetect" and key in ["input_res_x", "input_res_y"]: + value.fixed = True else: value.fixed = not mark log.debug("Setting %s/%s to fixed=%s for calibration", category, key, value.fixed) @@ -1769,8 +1772,8 @@ def __init__(self, configure=True, synchronize=True): self.categories["threshold2"] = "threshold_filters2" self.categories["threshold3"] = "threshold_filters3" self.algorithms["text_matchers"] = ("mixed",) - self.algorithms["text_detectors"] = ("erstat", "contours", "components") - self.algorithms["text_recognizers"] = ("tesseract", "hmm", "beamSearch") + self.algorithms["text_detectors"] = ("east", "erstat", "contours", "components") + self.algorithms["text_recognizers"] = ("pytesseract", "tesserocr", 
"tesseract", "hmm", "beamSearch") self.algorithms["threshold_filters2"] = tuple(self.algorithms["threshold_filters"]) self.algorithms["threshold_filters3"] = tuple(self.algorithms["threshold_filters"]) @@ -1828,7 +1831,13 @@ def __configure_backend(self, backend=None, category="text", reset=False): if category == "text": self.params[category]["datapath"] = CVParameter("../misc") elif category == "tdetect": - if backend == "erstat": + if backend == "east": + # network input dimensions - must be divisible by 32, however currently only + # 320x320 doesn't error out from the OpenCV implementation + self.params[category]["input_res_x"] = CVParameter(320, 32, None, 32.0) + self.params[category]["input_res_y"] = CVParameter(320, 32, None, 32.0) + self.params[category]["min_box_confidence"] = CVParameter(0.8, 0.0, 1.0, 0.1) + elif backend == "erstat": self.params[category]["thresholdDelta"] = CVParameter(1, 1, 255, 50.0) self.params[category]["minArea"] = CVParameter(0.00025, 0.0, 1.0, 0.25, 0.001) self.params[category]["maxArea"] = CVParameter(0.13, 0.0, 1.0, 0.25, 0.001) @@ -1843,7 +1852,7 @@ def __configure_backend(self, backend=None, category="text", reset=False): self.params[category]["minHeight"] = CVParameter(1, 0, None, 100.0) self.params[category]["maxHeight"] = CVParameter(100, 0, None, 100.0) self.params[category]["minAspectRatio"] = CVParameter(0.1, 0.0, None, 10.0) - self.params[category]["maxAspectRatio"] = CVParameter(1.5, 0.0, None, 10.0) + self.params[category]["maxAspectRatio"] = CVParameter(2.5, 0.0, None, 10.0) self.params[category]["horizontalSpacing"] = CVParameter(10, 0, None, 10.0) self.params[category]["verticalVariance"] = CVParameter(10, 0, None, 10.0) # 0 horizontal, 1 vertical @@ -1854,17 +1863,23 @@ def __configure_backend(self, backend=None, category="text", reset=False): # allowed and no intermediary values between 4 and 8 will be selected self.params[category]["connectivity"] = CVParameter(4, 4, 8, 4.0, 4.0) elif category == "ocr": - if 
backend == "tesseract": + if backend in ["tesseract", "tesserocr", "pytesseract"]: # eng, deu, etc. (ISO 639-3) self.params[category]["language"] = CVParameter("eng") - self.params[category]["char_whitelist"] = CVParameter("0123456789abcdefghijklmnopqrst" + self.params[category]["char_whitelist"] = CVParameter(" 0123456789abcdefghijklmnopqrst" "uvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") # 0 original tesseract only, 1 neural nets LSTM only, 2 both, 3 anything available self.params[category]["oem"] = CVParameter(3, 0, 3, enumerated=True) # 13 different page segmentation modes - see Tesseract API self.params[category]["psmode"] = CVParameter(3, 0, 13, enumerated=True) - # 0 OCR_LEVEL_WORD, 1 OCR_LEVEL_TEXT_LINE - self.params[category]["component_level"] = CVParameter(1, 0, 1, enumerated=True) + if backend == "pytesseract": + self.params[category]["extra_configs"] = CVParameter("") + elif backend == "tesserocr": + # TODO: there could be a decent way to change component modes + self.params[category]["component_level"] = CVParameter(1, 1, 1, enumerated=True) + else: + # 0 OCR_LEVEL_WORD, 1 OCR_LEVEL_TEXT_LINE + self.params[category]["component_level"] = CVParameter(1, 0, 1, enumerated=True) # perform custom image thresholding if set to true or leave it to the OCR self.params[category]["binarize_text"] = CVParameter(False) elif backend == "hmm": @@ -1950,6 +1965,8 @@ def __synchronize_backend(self, backend=None, category="text", reset=False): # nothing to sync return + elif category == "tdetect" and backend == "east": + self.east_net = cv2.dnn.readNet(os.path.join(datapath, 'frozen_east_text_detection.pb')) elif category == "tdetect" and backend == "erstat": self.erc1 = cv2.text.loadClassifierNM1(os.path.join(datapath, 'trained_classifierNM1.xml')) self.erf1 = cv2.text.createERFilterNM1(self.erc1, @@ -1966,7 +1983,24 @@ def __synchronize_backend(self, backend=None, category="text", reset=False): return elif category == "ocr": - if backend == "tesseract": + + if backend == 
"pytesseract": + import pytesseract + self.ocr = pytesseract + self.ocr_config = r"--tessdata-dir %s --oem %s --psm %s " + self.ocr_config %= (os.path.join(datapath, "tessdata"), + self.params["ocr"]["oem"].value, + self.params["ocr"]["psmode"].value) + self.ocr_config += r"-c tessedit_char_whitelist='%s' %s" + self.ocr_config %= (self.params["ocr"]["char_whitelist"].value, + self.params["ocr"]["extra_configs"].value) + elif backend == "tesserocr": + from tesserocr import PyTessBaseAPI + self.ocr = PyTessBaseAPI(lang=self.params["ocr"]["language"].value, + oem=self.params["ocr"]["oem"].value, + psm=self.params["ocr"]["psmode"].value) + self.ocr.SetVariable("tessedit_char_whitelist", self.params["ocr"]["char_whitelist"].value) + elif backend == "tesseract": self.ocr = cv2.text.OCRTesseract_create(os.path.join(datapath, "tessdata"), language=self.params["ocr"]["language"].value, char_whitelist=self.params["ocr"]["char_whitelist"].value, @@ -2064,7 +2098,10 @@ def find(self, needle, haystack): # detect characters and group them into detected text backend = self.params["tdetect"]["backend"] - if backend == "erstat": + log.debug("Detecting text with %s", backend) + if backend == "east": + text_regions = self._detect_text_east(haystack) + elif backend == "erstat": text_regions = self._detect_text_erstat(haystack) elif backend == "contours": text_regions = self._detect_text_contours(haystack) @@ -2074,6 +2111,8 @@ def find(self, needle, haystack): raise UnsupportedBackendError("Unsupported text detection backend %s" % backend) # perform optical character recognition on the final regions + backend = self.params["ocr"]["backend"] + log.debug("Recognizing text with %s", backend) from .match import Match matches = [] def binarize_step(threshold, text_img): @@ -2114,35 +2153,47 @@ def binarize_step(threshold, text_img): self.imglog.hotmaps.append(text_img) # BUG: we hit segfault when using the BeamSearch OCR backend so disallow it - if self.params["ocr"]["backend"] == 
"beamSearch": + if backend == "beamSearch": raise NotImplementedError("Current version of BeamSearch segfaults so it's not yet available") - # TODO: can't do this in python - available ony in C++ + # TODO: we can do this now with pytesseract/tesserocr but have to evaluate its usefulness #vector boxes; #vector words; #vector confidences; #output = ocr.run(group_img, &boxes, &words, &confidences, cv2.text.OCR_LEVEL_WORD) # redirection of tesseract's streams can only be done on the file descriptor level # sys.stdout = open(os.devnull, 'w') - stdout_fd = sys.stdout.fileno() if hasattr(sys.stdout, "fileno") else 1 - stderr_fd = sys.stderr.fileno() if hasattr(sys.stderr, "fileno") else 2 - null_fo = open(os.devnull, 'wb') - with os.fdopen(os.dup(stdout_fd), 'wb') as cpout_fo: - with os.fdopen(os.dup(stderr_fd), 'wb') as cperr_fo: - sys.stdout.flush() - sys.stderr.flush() - os.dup2(null_fo.fileno(), stdout_fd) - os.dup2(null_fo.fileno(), stderr_fd) - output = self.ocr.run(text_img, text_img, - self.params["ocr"]["min_confidence"].value, - self.params["ocr"]["component_level"].value) - sys.stdout.flush() - sys.stderr.flush() - os.dup2(cpout_fo.fileno(), stdout_fd) - os.dup2(cperr_fo.fileno(), stderr_fd) - null_fo.close() - if self.params["ocr"]["component_level"].value == 1: - # strip of the new line character which is never useful - output = output.rstrip() + if backend == "pytesseract": + output = self.ocr.image_to_string(text_img, + lang=self.params["ocr"]["language"].value, + config=self.ocr_config) + logging.debug("Running pytesseract with extra command line %s", self.ocr_config) + elif backend == "tesserocr": + self.ocr.SetImage(PIL.Image.fromarray(text_img)) + output = self.ocr.GetUTF8Text() + if self.params["ocr"]["component_level"].value == 1: + # strip of the new line character which is never useful + output = output.rstrip() + else: + stdout_fd = sys.stdout.fileno() if hasattr(sys.stdout, "fileno") else 1 + stderr_fd = sys.stderr.fileno() if hasattr(sys.stderr, 
"fileno") else 2 + null_fo = open(os.devnull, 'wb') + with os.fdopen(os.dup(stdout_fd), 'wb') as cpout_fo: + with os.fdopen(os.dup(stderr_fd), 'wb') as cperr_fo: + sys.stdout.flush() + sys.stderr.flush() + os.dup2(null_fo.fileno(), stdout_fd) + os.dup2(null_fo.fileno(), stderr_fd) + output = self.ocr.run(text_img, text_img, + self.params["ocr"]["min_confidence"].value, + self.params["ocr"]["component_level"].value) + sys.stdout.flush() + sys.stderr.flush() + os.dup2(cpout_fo.fileno(), stdout_fd) + os.dup2(cperr_fo.fileno(), stderr_fd) + null_fo.close() + if self.params["ocr"]["component_level"].value == 1: + # strip of the new line character which is never useful + output = output.rstrip() log.debug("OCR output %s = '%s'", i+1, output) similarity = 1.0 - float(needle.distance_to(output)) / max(len(output), len(text_needle)) @@ -2162,6 +2213,94 @@ def binarize_step(threshold, text_img): self.imglog.log(30) return matches + def _detect_text_east(self, haystack): + #:.. note:: source implementation by Adrian Rosebrock from his post: + #: https://www.pyimagesearch.com/2018/08/20/opencv-text-detection-east-text-detector/ + import cv2 + import numpy + img = numpy.array(haystack.pil_image) + char_canvas = cv2.cvtColor(numpy.array(haystack.pil_image), cv2.COLOR_RGB2GRAY) + text_canvas = numpy.array(haystack.pil_image) + self.imglog.hotmaps.append(char_canvas) + self.imglog.hotmaps.append(text_canvas) + + # resize the image to resolution compatible with the model + inp_width, inp_height = (self.params["tdetect"]["input_res_x"].value, + self.params["tdetect"]["input_res_y"].value) + width_ratio = img.shape[1] / float(inp_width) + height_ratio = img.shape[0] / float(inp_height) + img = cv2.resize(img, (inp_width, inp_height)) + + # convert to a model-compatible input using the mean from the training + inp = cv2.dnn.blobFromImage(img, mean=(123.68, 116.78, 103.94), swapRB=True, crop=False) + self.east_net.setInput(inp) + + # select two output layers for the EAST detector model 
respectively for + # the output probabilities and the text bounding box coordinates + output_layers = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"] + probability, geometry = self.east_net.forward(output_layers) + char_canvas[:] = cv2.resize(probability[0,0]*255.0, (char_canvas.shape[1], char_canvas.shape[0])) + + rects = [] + for row in range(0, probability.shape[2]): + row_scores = probability[0, 0, row] + row_data = geometry[0, :, row] + for col in range(0, probability.shape[3]): + # prune out subthreshold probability of being a text + if row_scores[col] < self.params["tdetect"]["min_box_confidence"].value: + continue + # use geometry data to get input size and rescale for final bounding box width and height + h = min(row_data[0][col] + row_data[2][col], inp_height) * height_ratio + w = min(row_data[1][col] + row_data[3][col], inp_width) * width_ratio + # output layer dimensions are 4x smaller than the input layer dimensions + (dx, dy) = (col + 1) * 4.0, (row + 1) * 4.0 + # calculate the rotation angle from the prediction output + sin, cos = numpy.sin(row_data[4][col]), numpy.cos(row_data[4][col]) + # compute the starting (from ending) coordinates for the text bounding box + x2 = min(dx + cos * row_data[1][col] + sin * row_data[2][col], inp_width) * width_ratio + y2 = min(dy - sin * row_data[1][col] + cos * row_data[2][col], inp_height) * height_ratio + # the network might give unlimited region boundaries so limit by input width/height (above) + x1, y1 = x2 - w, y2 - h + + rect = (int(x1), int(y1), int(w), int(h)) + cv2.rectangle(char_canvas, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (0, 0, 0), 2) + cv2.rectangle(char_canvas, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (255, 255, 255), 1) + rects.append(rect) + # TODO: needed for outsourced nonmaxima suppression + # confidences.append(row_scores[col]) + + logging.debug("A total of %s possible text regions found", len(rects)) + + # produce a final set of nonintersecting text 
regions + text_regions = [] + # TODO: apply outsourced nonmaxima suppression as the current OpenCV + # implementation is broken in the number of python2C++ called arguments + # indices = cv2.dnn.NMSBoxesRotated(rects, confidences, 0.5, 0.5, 1., 0) + region_queue = [[region, True] for region in rects] + while True: + # nothing to do for just one region + if len(region_queue) < 2: + break + r1, flag1 = region_queue.pop(0) + if not flag1: + continue + for r2pair in region_queue: + r2, _ = r2pair + # if the two regions intersect + if (r1[0] < r2[0] + r2[2] and r1[0] + r1[2] > r2[0] and + r1[1] < r2[1] + r2[3] and r1[1] + r1[3] > r2[1]): + r1 = [min(r1[0], r2[0]), min(r1[1], r2[1]), max(r1[2], r2[2]), max(r1[3], r2[3])] + # second region will no longer be considered + r2pair[1] = False + # first region is now merged with all intersecting regions + text_regions.append(r1) + for rect in text_regions: + cv2.rectangle(text_canvas, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (0, 0, 0), 2) + cv2.rectangle(text_canvas, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (0, 0, 255), 1) + + logging.debug("A total of %s final text regions found", len(text_regions)) + return text_regions + def _detect_text_erstat(self, haystack): import cv2 import numpy diff --git a/misc/frozen_east_text_detection.pb b/misc/frozen_east_text_detection.pb new file mode 100644 index 00000000..5702180c Binary files /dev/null and b/misc/frozen_east_text_detection.pb differ diff --git a/packaging/packager_deb.sh b/packaging/packager_deb.sh index 83e3b914..154eeab2 100644 --- a/packaging/packager_deb.sh +++ b/packaging/packager_deb.sh @@ -11,6 +11,8 @@ apt-get update apt-get -y install python3 python3-coverage # python-imaging apt-get -y install python3-pil +# pip dependencies (for dependencies not available as DEB) +apt-get -y install gcc libx11-dev libxtst-dev python3-dev libpng-dev python3-pip # contour, template, feature, cascade, text matching apt-get -y install python3-numpy if [[ 
$distro_version == "xenial" ]]; then @@ -19,22 +21,22 @@ else apt-get -y install python3-opencv fi # text matching -if [[ $distro_version == "focal" ]]; then - # TODO: OpenCV's OCR API for Tesseract 4.1+ is broken +if [[ $distro_version == "xenial" ]]; then export DISABLE_OCR=1 +else + apt-get -y install tesseract-ocr libtesseract-dev + apt-get -y install g++ pkg-config + pip3 install pytesseract==0.3.4 tesserocr==2.5.1 fi -apt-get -y install tesseract-ocr -# desktop control -apt-get -y install xdotool x11-apps imagemagick -apt-get -y install x11vnc - -# pip dependencies (not available as DEB) -apt-get -y install gcc libx11-dev libxtst-dev python3-dev libpng-dev python3-pip -pip3 install autopy==4.0.0 +# deep learning pip3 install torch==1.4.0 torchvision==0.5.0 +# screen controlling +pip3 install autopy==4.0.0 pip3 install vncdotool==0.12.0 +apt-get -y install xdotool x11-apps imagemagick +apt-get -y install x11vnc -# deb packaging +# deb packaging and installing of current guibot source apt-get -y install dh-make dh-python debhelper python3-all devscripts ROOT="" NAME=$(sed -n 's/^Package:[ \t]*//p' "$ROOT/guibot/packaging/debian/control") diff --git a/packaging/packager_rpm.sh b/packaging/packager_rpm.sh index 8f966670..58a1e3e3 100644 --- a/packaging/packager_rpm.sh +++ b/packaging/packager_rpm.sh @@ -9,23 +9,23 @@ readonly distro_version="${VERSION:-30}" dnf -y install python3 python3-coverage # python-imaging dnf -y install python3-pillow +# pip dependencies (for dependencies not available as RPM) +dnf -y install gcc libX11-devel libXtst-devel python3-devel libpng-devel python3-pip redhat-rpm-config # contour, template, feature, cascade, text matching dnf -y install python3-numpy python3-opencv # text matching -# TODO: current cv2.text module is either missing or compatible with Tesseract 3 (we use 4) -export DISABLE_OCR=1 -dnf -y install tesseract -# desktop control -dnf -y install xdotool xwd ImageMagick -dnf -y install x11vnc - -# pip dependencies (not 
available as RPM) -dnf -y install gcc libX11-devel libXtst-devel python3-devel libpng-devel python3-pip redhat-rpm-config -pip3 install autopy==4.0.0 +dnf -y install tesseract tesseract-devel +dnf -y install gcc-c++ +pip3 install pytesseract==0.3.4 tesserocr==2.5.1 +# deep learning pip3 install torch==1.4.0 torchvision==0.5.0 +# screen controlling +pip3 install autopy==4.0.0 pip3 install vncdotool==0.12.0 +dnf -y install xdotool xwd ImageMagick +dnf -y install x11vnc -# rpm packaging +# rpm packaging and installing of current guibot source dnf -y install rpm-build ROOT="" NAME=$(sed -n 's/^Name:[ \t]*//p' "$ROOT/guibot/packaging/guibot.spec") diff --git a/packaging/pip_requirements.txt b/packaging/pip_requirements.txt index 2238ccb9..814b81d4 100644 --- a/packaging/pip_requirements.txt +++ b/packaging/pip_requirements.txt @@ -5,6 +5,8 @@ Pillow==7.2.0 # backends autopy==4.0.0; platform_python_implementation != "PyPy" +pytesseract==0.3.4 +tesserocr==2.5.1 opencv-contrib-python==4.2.0.34; platform_python_implementation != "PyPy" # TODO: travis errors out with > 4MB log size only from torch download torch==1.4.0; 'generic' not in platform_release and platform_python_implementation != "PyPy" diff --git a/tests/test_calibrator.py b/tests/test_calibrator.py index cf023102..5200f3f8 100755 --- a/tests/test_calibrator.py +++ b/tests/test_calibrator.py @@ -208,8 +208,15 @@ def test_benchmark_text(self): # also get rid of these since they are not implemented anyway finder.algorithms["text_detectors"] = list(finder.algorithms["text_detectors"]) finder.algorithms["text_detectors"].remove("components") + import cv2 + # TODO: deprecate OpenCV 3.X versions after time + if cv2.__version__.startswith("3."): + finder.algorithms["text_detectors"].remove("east") finder.algorithms["text_recognizers"] = list(finder.algorithms["text_recognizers"]) finder.algorithms["text_recognizers"].remove("beamSearch") + # one tesseract backend is enough for the unit test + 
finder.algorithms["text_recognizers"].remove("tesseract") + finder.algorithms["text_recognizers"].remove("pytesseract") results = calibrator.benchmark(finder, calibration=calibration, random_starts=random_starts) # pprint.pprint(results) self.assertGreater(len(results), 0, "There should be at least one benchmarked method") diff --git a/tests/test_finder.py b/tests/test_finder.py index f80ee9cf..49d2f3f0 100644 --- a/tests/test_finder.py +++ b/tests/test_finder.py @@ -575,11 +575,21 @@ def test_text_same(self): # TODO: this is still not implemented if ocr == "beamSearch": continue + # TODO: handle newer OpenCV bugs with some backends + import cv2 + # TODO: OpenCV 4.2.0 Tesseract bindings output nothing + if cv2.__version__ == "4.2.0" and ocr == "tesseract": + continue + # TODO: deprecate OpenCV 3.X versions after time + elif cv2.__version__.startswith("3.") and tdetect == "east": + continue # HMM misinterprets one char leading to 3/4 recognized chars # Tesseract still has similarity 1.0 though if ocr == "hmm": finder.params["find"]["similarity"].value = 0.75 + if tdetect == "east": + finder.params["find"]["similarity"].value = 0.4 else: finder.params["find"]["similarity"].value = 1.0 @@ -592,10 +602,13 @@ def test_text_same(self): # verify match accuracy self.assertEqual(len(matches), 1) - self.assertEqual(matches[0].x, 22) - self.assertEqual(matches[0].y, 83) - self.assertAlmostEqual(matches[0].width, 40, delta=3) - self.assertAlmostEqual(matches[0].height, 15, delta=3) + # the EAST network confuses the space among some squares with + # text and thus still read the output but in a larger rectangle + if tdetect != "east": + self.assertEqual(matches[0].x, 22) + self.assertEqual(matches[0].y, 83) + self.assertAlmostEqual(matches[0].width, 40, delta=3) + self.assertAlmostEqual(matches[0].height, 15, delta=3) # verify dumped files count and names dumps = self._verify_and_get_dumps(7, i) @@ -635,6 +648,12 @@ def test_text_nomatch(self): # TODO: this is still not 
implemented if ocr == "beamSearch": continue + # TODO: handle newer OpenCV bugs with some backends + import cv2 + # TODO: deprecate OpenCV 3.X versions after time + if cv2.__version__.startswith("3.") and tdetect == "east": + continue + finder.configure_backend(tdetect, "tdetect") finder.configure_backend(ocr, "ocr") # also with customized synchronization to the configuration @@ -672,13 +691,12 @@ def test_text_nomatch(self): "Disabled OpenCV or OCR") def test_text_basic(self): finder = TextFinder() - finder.params["find"]["similarity"].value = 0.7 matches = finder.find(Text('Find the word here'), Image('sentence_sans')) self.assertEqual(len(matches), 1) # TODO: location too far due to poor text detection #self.assertEqual(matches[0].x, 11) self.assertEqual(matches[0].y, 12) - self.assertAlmostEqual(matches[0].width, 110, delta=5) + self.assertAlmostEqual(matches[0].width, 115, delta=5) self.assertAlmostEqual(matches[0].height, 10, delta=5) @unittest.skipIf(os.environ.get('DISABLE_OPENCV', "0") == "1" or @@ -686,7 +704,6 @@ def test_text_basic(self): "Disabled OpenCV or OCR") def test_text_bold(self): finder = TextFinder() - finder.params["find"]["similarity"].value = 0.8 matches = finder.find(Text('Find the word'), Image('sentence_bold')) self.assertEqual(len(matches), 1) self.assertEqual(matches[0].x, 12) @@ -699,7 +716,6 @@ def test_text_bold(self): "Disabled OpenCV or OCR") def test_text_italic(self): finder = TextFinder() - finder.params["find"]["similarity"].value = 0.7 matches = finder.find(Text('Find the word here'), Image('sentence_italic')) self.assertEqual(len(matches), 1) self.assertEqual(matches[0].x, 11) @@ -712,8 +728,6 @@ def test_text_italic(self): "Disabled OpenCV or OCR") def test_text_larger(self): finder = TextFinder() - # TODO: this is too low to be a match (due to text detection) - finder.params["find"]["similarity"].value = 0.4 matches = finder.find(Text('Find the word'), Image('sentence_larger')) self.assertEqual(len(matches), 1) # TODO: 
location too far due to poor text detection @@ -727,8 +741,6 @@ def test_text_larger(self): "Disabled OpenCV or OCR") def test_text_font(self): finder = TextFinder() - # TODO: this is too low to be a match - finder.params["find"]["similarity"].value = 0.3 matches = finder.find(Text('Find the word here'), Image('sentence_font')) self.assertEqual(len(matches), 1) self.assertEqual(matches[0].x, 7)