Skip to content

Commit

Permalink
Drop requirement on using tessdata environment variable
Browse files Browse the repository at this point in the history
As we now require more tessdata than just the trained languages
and guibot's misc data path is not included in all forms of packaging,
it is better to drop any tessdata distributed within guibot and thus
any requirement to explicitly provide a (tess)data path in the form
of a CV parameter or an environment variable. If set, though, these are
still honored to support any previous customization: the CV parameter
data path is tried first, then the environment variable.
  • Loading branch information
pevogam committed Apr 29, 2024
1 parent 356ed00 commit b27f315
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 22 deletions.
48 changes: 27 additions & 21 deletions guibot/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2002,19 +2002,22 @@ def __synchronize_backend(self, backend=None, category="text", reset=False):

import cv2
datapath = self.params["text"]["datapath"].value
tessdata_path = os.path.join(datapath, "tessdata")
if not os.path.exists(tessdata_path):
tessdata_path = os.environ.get("TESSDATA_PREFIX", "./tessdata")
if not os.path.exists(tessdata_path):
tessdata_path = None

if category == "text" or category in ["contour", "threshold", "threshold2"]:
# nothing to sync
return

elif category == "tdetect" and backend == "pytesseract":
tessdata_path = os.path.join(datapath, "tessdata")
if not os.path.exists(tessdata_path):
tessdata_path = os.environ.get("TESSDATA_PREFIX", ".")

import pytesseract
self.tbox = pytesseract
self.tbox_config = r"--tessdata-dir %s --oem %s --psm %s "
self.tbox_config %= (tessdata_path,
tessdata_dir = "--tessdata-dir '" + tessdata_path + "'" if tessdata_path else ""
self.tbox_config = r"%s --oem %s --psm %s "
self.tbox_config %= (tessdata_dir,
self.params["tdetect"]["oem"].value,
self.params["tdetect"]["psmode"].value)
self.tbox_config += r"-c tessedit_char_whitelist='%s' %s batch.nochop wordstrbox"
Expand All @@ -2038,33 +2041,36 @@ def __synchronize_backend(self, backend=None, category="text", reset=False):
return

elif category == "ocr":
tessdata_path = os.path.join(datapath, "tessdata")
if not os.path.exists(tessdata_path):
tessdata_path = os.environ.get("TESSDATA_PREFIX", ".")

if backend == "pytesseract":
import pytesseract
self.ocr = pytesseract
self.ocr_config = r"--tessdata-dir '%s' --oem %s --psm %s "
self.ocr_config %= (tessdata_path,
tessdata_dir = "--tessdata-dir '" + tessdata_path + "'" if tessdata_path else ""
self.ocr_config = r"%s --oem %s --psm %s "
self.ocr_config %= (tessdata_dir,
self.params["ocr"]["oem"].value,
self.params["ocr"]["psmode"].value)
self.ocr_config += r"-c tessedit_char_whitelist='%s' %s"
self.ocr_config %= (self.params["ocr"]["char_whitelist"].value,
self.params["ocr"]["extra_configs"].value)
elif backend == "tesserocr":
from tesserocr import PyTessBaseAPI
self.ocr = PyTessBaseAPI(path=tessdata_path,
lang=self.params["ocr"]["language"].value,
oem=self.params["ocr"]["oem"].value,
psm=self.params["ocr"]["psmode"].value)
kwargs = {"lang": self.params["ocr"]["language"].value,
"oem": self.params["ocr"]["oem"].value,
"psm": self.params["ocr"]["psmode"].value}
if tessdata_path:
self.ocr = PyTessBaseAPI(path=tessdata_path, **kwargs)
else:
self.ocr = PyTessBaseAPI(**kwargs)
self.ocr.SetVariable("tessedit_char_whitelist", self.params["ocr"]["char_whitelist"].value)
elif backend == "tesseract":
self.ocr = cv2.text.OCRTesseract_create(tessdata_path,
language=self.params["ocr"]["language"].value,
char_whitelist=self.params["ocr"]["char_whitelist"].value,
oem=self.params["ocr"]["oem"].value,
psmode=self.params["ocr"]["psmode"].value)
kwargs = {"language": self.params["ocr"]["language"].value,
"char_whitelist": self.params["ocr"]["char_whitelist"].value,
"oem": self.params["ocr"]["oem"].value,
"psmode": self.params["ocr"]["psmode"].value}
if tessdata_path:
self.ocr = cv2.text.OCRTesseract_create(datapath, **kwargs)
else:
self.ocr = cv2.text.OCRTesseract_create(**kwargs)
elif backend in ["hmm", "beamSearch"]:

import numpy
Expand Down
Binary file removed misc/tessdata/deu.traineddata
Binary file not shown.
Binary file removed misc/tessdata/eng.traineddata
Binary file not shown.
2 changes: 1 addition & 1 deletion packaging/guibot.spec
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ pushd packaging
%{__python3} setup.py install --root %{buildroot}
popd
%{__install} -d %{buildroot}%{python3_sitelib}/guibot/tests/images
%{__install} -d %{buildroot}%{python3_sitelib}/guibot/misc/tessdata
%{__install} -d %{buildroot}%{python3_sitelib}/guibot/misc
%{__cp} -a tests/* %{buildroot}%{python3_sitelib}/guibot/tests
%{__cp} -a misc/* %{buildroot}%{python3_sitelib}/guibot/misc

Expand Down

0 comments on commit b27f315

Please sign in to comment.