From 59d4e34d0cbc0cbe4c027fca7a335bc755939912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mauricio=20=22Pach=C3=A1=22=20Vargas=20Sep=C3=BAlveda?= Date: Sat, 3 Aug 2024 14:13:32 -0400 Subject: [PATCH] Add option to download best models with Tesseract 4 or higher (#64) --- DESCRIPTION | 2 +- R/tessdata.R | 25 ++++++++++++++++++------- man/ocr.Rd | 4 ++-- man/tessdata.Rd | 15 +++++++++++---- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 838561b..1bef9ef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,7 @@ Imports: rappdirs, digest LinkingTo: Rcpp -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 Roxygen: list(markdown = TRUE) Suggests: magick (>= 1.7), diff --git a/R/tessdata.R b/R/tessdata.R index 9ffeedf..1859c3a 100644 --- a/R/tessdata.R +++ b/R/tessdata.R @@ -1,8 +1,7 @@ #' Tesseract Training Data #' #' Helper function to download training data from the official -#' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository. Only use this function on -#' Windows and OS-X. On Linux, training data can be installed directly with +#' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository. On Linux, the fast training data can be installed directly with #' [yum](https://src.fedoraproject.org/rpms/tesseract) or #' [apt-get](https://packages.debian.org/search?suite=stable&section=all&arch=any&searchon=names&keywords=tesseract-ocr-). #' @@ -23,31 +22,43 @@ #' @family tesseract #' @param lang three letter code for language, see [tessdata](https://github.com/tesseract-ocr/tessdata) repository. #' @param datapath destination directory where to download store the file +#' @param model either `fast` or `best` is currently supported. 
The latter downloads +#' more accurate (but slower) trained models for Tesseract 4.0 or higher #' @param progress print progress while downloading #' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files) #' @examples \dontrun{ #' if(is.na(match("fra", tesseract_info()$available))) -#' tesseract_download("fra") +#' tesseract_download("fra", model = 'best') #' french <- tesseract("fra") #' text <- ocr("https://jeroen.github.io/images/french_text.png", engine = french) #' cat(text) #' } -tesseract_download <- function(lang, datapath = NULL, progress = interactive()){ +tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress = interactive()) { stopifnot(is.character(lang)) + model <- match.arg(model) if(!length(datapath)){ warn_on_linux() datapath <- tesseract_info()$datapath } datapath <- normalizePath(datapath, mustWork = TRUE) version <- tesseract_version_major() + if(version < 4){ repo <- "tessdata" release <- "3.04.00" } else { - repo <- "tessdata_fast" + repo <- paste0("tessdata_", model) release <- "4.1.0" } - url <- sprintf('https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata', repo, release, lang) + + url <- sprintf("https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata", repo, release, lang) + + destfile <- file.path(datapath, basename(url)) + + if (file.exists(destfile)) { + message(paste("Training data already exists. Overwriting", destfile)) + } + req <- curl::curl_fetch_memory(url, curl::new_handle( progressfunction = progress_fun, noprogress = !isTRUE(progress) @@ -56,7 +67,7 @@ tesseract_download <- function(lang, datapath = NULL, progress = interactive()){ cat("\n") if(req$status_code != 200) stop("Download failed: HTTP ", req$status_code, call. 
= FALSE) - destfile <- file.path(datapath, basename(url)) + writeBin(req$content, destfile) return(destfile) } diff --git a/man/ocr.Rd b/man/ocr.Rd index 1be077b..5d0ca9f 100644 --- a/man/ocr.Rd +++ b/man/ocr.Rd @@ -61,7 +61,7 @@ engine <- tesseract(options = list(tessedit_char_whitelist = "0123456789")) } \seealso{ Other tesseract: -\code{\link{tesseract_download}()}, -\code{\link{tesseract}()} +\code{\link{tesseract}()}, +\code{\link{tesseract_download}()} } \concept{tesseract} diff --git a/man/tessdata.Rd b/man/tessdata.Rd index d74e3d9..627bf57 100644 --- a/man/tessdata.Rd +++ b/man/tessdata.Rd @@ -5,19 +5,26 @@ \alias{tessdata} \title{Tesseract Training Data} \usage{ -tesseract_download(lang, datapath = NULL, progress = interactive()) +tesseract_download( + lang, + datapath = NULL, + model = c("fast", "best"), + progress = interactive() +) } \arguments{ \item{lang}{three letter code for language, see \href{https://github.com/tesseract-ocr/tessdata}{tessdata} repository.} \item{datapath}{destination directory where to download store the file} +\item{model}{either \code{fast} or \code{best} is currently supported. The latter downloads +more accurate (but slower) trained models for Tesseract 4.0 or higher} + \item{progress}{print progress while downloading} } \description{ Helper function to download training data from the official -\href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tessdata} repository. Only use this function on -Windows and OS-X. On Linux, training data can be installed directly with +\href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tessdata} repository. On Linux, the fast training data can be installed directly with \href{https://src.fedoraproject.org/rpms/tesseract}{yum} or \href{https://packages.debian.org/search?suite=stable&section=all&arch=any&searchon=names&keywords=tesseract-ocr-}{apt-get}. 
} @@ -37,7 +44,7 @@ and stores it in a the path on disk given by the \code{TESSDATA_PREFIX} variable \examples{ \dontrun{ if(is.na(match("fra", tesseract_info()$available))) - tesseract_download("fra") + tesseract_download("fra", model = 'best') french <- tesseract("fra") text <- ocr("https://jeroen.github.io/images/french_text.png", engine = french) cat(text)