From 59d4e34d0cbc0cbe4c027fca7a335bc755939912 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mauricio=20=22Pach=C3=A1=22=20Vargas=20Sep=C3=BAlveda?=
 <m.sepulveda@mail.utoronto.ca>
Date: Sat, 3 Aug 2024 14:13:32 -0400
Subject: [PATCH] Add option to download best models with Tesseract 4 or higher
 (#64)

---
 DESCRIPTION     |  2 +-
 R/tessdata.R    | 25 ++++++++++++++++++-------
 man/ocr.Rd      |  4 ++--
 man/tessdata.Rd | 15 +++++++++++----
 4 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 838561b..1bef9ef 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -22,7 +22,7 @@ Imports:
     rappdirs,
     digest
 LinkingTo: Rcpp
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Roxygen: list(markdown = TRUE)
 Suggests:
     magick (>= 1.7),
diff --git a/R/tessdata.R b/R/tessdata.R
index 9ffeedf..1859c3a 100644
--- a/R/tessdata.R
+++ b/R/tessdata.R
@@ -1,8 +1,7 @@
 #' Tesseract Training Data
 #'
 #' Helper function to download training data from the official
-#' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository. Only use this function on
-#' Windows and OS-X. On Linux, training data can be installed directly with
+#' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository. On Linux, the fast training data can be installed directly with
 #' [yum](https://src.fedoraproject.org/rpms/tesseract) or
 #' [apt-get](https://packages.debian.org/search?suite=stable&section=all&arch=any&searchon=names&keywords=tesseract-ocr-).
 #'
@@ -23,31 +22,43 @@
 #' @family tesseract
 #' @param lang three letter code for language, see [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
 #' @param datapath destination directory where to download store the file
+#' @param model either `"fast"` (the default) or `"best"`. The latter downloads
+#' more accurate (but slower) trained models; it only applies to Tesseract 4.0 or higher
 #' @param progress print progress while downloading
 #' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
 #' @examples \dontrun{
 #' if(is.na(match("fra", tesseract_info()$available)))
-#'   tesseract_download("fra")
+#'   tesseract_download("fra", model = 'best')
 #' french <- tesseract("fra")
 #' text <- ocr("https://jeroen.github.io/images/french_text.png", engine = french)
 #' cat(text)
 #' }
-tesseract_download <- function(lang, datapath = NULL, progress = interactive()){
+tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress = interactive()) {
   stopifnot(is.character(lang))
+  model <- match.arg(model)
   if(!length(datapath)){
     warn_on_linux()
     datapath <- tesseract_info()$datapath
   }
   datapath <- normalizePath(datapath, mustWork = TRUE)
   version <- tesseract_version_major()
+
   if(version < 4){
     repo <- "tessdata"
     release <- "3.04.00"
   } else {
-    repo <- "tessdata_fast"
+    repo <- paste0("tessdata_", model)
     release <- "4.1.0"
   }
-  url <- sprintf('https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata', repo, release, lang)
+
+  url <- sprintf("https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata", repo, release, lang)
+
+  destfile <- file.path(datapath, basename(url))
+
+  if (file.exists(destfile)) {
+    message(paste("Training data already exists. Overwriting", destfile))
+  }
+
   req <- curl::curl_fetch_memory(url, curl::new_handle(
     progressfunction = progress_fun,
     noprogress = !isTRUE(progress)
@@ -56,7 +67,7 @@ tesseract_download <- function(lang, datapath = NULL, progress = interactive()){
     cat("\n")
   if(req$status_code != 200)
     stop("Download failed: HTTP ", req$status_code, call. = FALSE)
-  destfile <- file.path(datapath, basename(url))
+
   writeBin(req$content, destfile)
   return(destfile)
 }
diff --git a/man/ocr.Rd b/man/ocr.Rd
index 1be077b..5d0ca9f 100644
--- a/man/ocr.Rd
+++ b/man/ocr.Rd
@@ -61,7 +61,7 @@ engine <- tesseract(options = list(tessedit_char_whitelist = "0123456789"))
 }
 \seealso{
 Other tesseract: 
-\code{\link{tesseract_download}()},
-\code{\link{tesseract}()}
+\code{\link{tesseract}()},
+\code{\link{tesseract_download}()}
 }
 \concept{tesseract}
diff --git a/man/tessdata.Rd b/man/tessdata.Rd
index d74e3d9..627bf57 100644
--- a/man/tessdata.Rd
+++ b/man/tessdata.Rd
@@ -5,19 +5,26 @@
 \alias{tessdata}
 \title{Tesseract Training Data}
 \usage{
-tesseract_download(lang, datapath = NULL, progress = interactive())
+tesseract_download(
+  lang,
+  datapath = NULL,
+  model = c("fast", "best"),
+  progress = interactive()
+)
 }
 \arguments{
 \item{lang}{three letter code for language, see \href{https://github.com/tesseract-ocr/tessdata}{tessdata} repository.}
 
 \item{datapath}{destination directory where to download store the file}
 
+\item{model}{either \code{"fast"} (the default) or \code{"best"}. The latter downloads
+more accurate (but slower) trained models; it only applies to Tesseract 4.0 or higher}
+
 \item{progress}{print progress while downloading}
 }
 \description{
 Helper function to download training data from the official
-\href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tessdata} repository. Only use this function on
-Windows and OS-X. On Linux, training data can be installed directly with
+\href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tessdata} repository. On Linux, the fast training data can be installed directly with
 \href{https://src.fedoraproject.org/rpms/tesseract}{yum} or
 \href{https://packages.debian.org/search?suite=stable&section=all&arch=any&searchon=names&keywords=tesseract-ocr-}{apt-get}.
 }
@@ -37,7 +44,7 @@ and stores it in a the path on disk given by the \code{TESSDATA_PREFIX} variable
 \examples{
 \dontrun{
 if(is.na(match("fra", tesseract_info()$available)))
-  tesseract_download("fra")
+  tesseract_download("fra", model = 'best')
 french <- tesseract("fra")
 text <- ocr("https://jeroen.github.io/images/french_text.png", engine = french)
 cat(text)