Skip to content

Commit

Permalink
add contributed models
Browse files Browse the repository at this point in the history
  • Loading branch information
pachadotdev committed Sep 23, 2024
1 parent 17df116 commit 6141f79
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 116 deletions.
4 changes: 4 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ vignettes/.*\.png$
^configure.log$
^\.github$
^\.vscode$
^README\.Rmd$
^README\.html$
^docs$
^pkgdown$
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ S3method(print,tesseract)
export(ocr)
export(ocr_data)
export(tesseract)
export(tesseract_contributed_download)
export(tesseract_download)
export(tesseract_info)
export(tesseract_params)
Expand Down
102 changes: 0 additions & 102 deletions NEWS

This file was deleted.

10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# 5.3.0
- This is a fork of the original tesseract package made by Jeroen Ooms.
- Differences:
- Uses cpp11 instead of Rcpp.
- Provides functions to download and use the slower but more accurate
models.
- Provides functions to download and use contributed models.
- The documentation works offline.
- Explicitly tests on Ubuntu with Tesseract 4 (Ubuntu 22.04 default) and
Tesseract 5 (PPA).
97 changes: 84 additions & 13 deletions R/tessdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@
#' @param progress print progress while downloading
#' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
#' @examples \dontrun{
#' if(is.na(match("fra", tesseract_info()$available)))
#' if (is.na(match("fra", tesseract_info()$available))) {
#' tesseract_download("fra", model = "best")
#' }
#' french <- tesseract("fra")
#' file <- system.file("examples", "french.png", package = "cpp11tesseract")
#' text <- ocr(file, engine = french)
Expand All @@ -37,57 +38,127 @@
tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress = interactive()) {
stopifnot(is.character(lang))
model <- match.arg(model)
if(!length(datapath)){
if (!length(datapath)) {
warn_on_linux()
datapath <- tesseract_info()$datapath
}
datapath <- normalizePath(datapath, mustWork = TRUE)
version <- tesseract_version_major()

if(version < 4){
if (version < 4) {
repo <- "tessdata"
release <- "3.04.00"
} else {
repo <- paste0("tessdata_", model)
release <- "4.1.0"
}

url <- sprintf("https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata", repo, release, lang)
download_helper(url, datapath, progress)
}

#' Tesseract Contributed Training Data
#'
#' Helper function to download training data from the contributed
#' [tessdata_contrib](https://github.com/tesseract-ocr/tessdata_contrib) repository.
#'
#' @export
#' @aliases tessdata
#' @rdname tessdata
#' @family tesseract
#' @seealso [tesseract_download]
#' @param lang three letter code for language, see [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
#' @param datapath destination directory where to download store the file
#' @param model either `fast` or `best` is currently supported. The latter downloads
#' more accurate (but slower) trained models for Tesseract 4.0 or higher
#' @param progress print progress while downloading
#' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
#' @examples
#' \dontrun{
#' if (is.na(match("grc_hist", tesseract_info()$available))) {
#' tesseract_contributed_download("grc_hist")
#' }
#' greek <- tesseract("grc_hist")
#' file <- system.file("examples", "polytonicgreek.png", package = "cpp11tesseract")
#' text <- ocr(file, engine = greek)
#' cat(text)
#' }
tesseract_contributed_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress = interactive()) {
stopifnot(is.character(lang))
if (!any(lang %in% c("grc_hist", "akk"))) {
stop("The only available contributed models are Akkadian and Polytonic Greek (for now).", call. = FALSE)
}
model <- match.arg(model)
if (!length(datapath)) {
warn_on_linux()
datapath <- tesseract_info()$datapath
}
datapath <- normalizePath(datapath, mustWork = TRUE)
version <- tesseract_version_major()

if (lang == "grc_hist" && version < 4) {
stop("The Polytonic Greek model is only available for Tesseract 4.0 or higher.", call. = FALSE)
}

if (lang == "grc_hist") {
if (model == "fast") {
warning("The Polytonic Greek model is only available in 'best' quality.", call. = FALSE)
}
release <- "grc_hist/best"
}

if (lang == "akk" && version < 4) {
release <- "akk/legacy"
} else if (lang == "akk" && model == "best") {
release <- "akk/best"
} else if (lang == "akk" && model == "fast") {
release <- "akk/fast"
}

url <- sprintf("https://github.com/tesseract-ocr/tessdata_contrib/raw/main/%s/%s.traineddata", release, lang)
print(url)

download_helper(url, datapath, progress)
}

download_helper <- function(url, datapath, progress) {
destfile <- file.path(datapath, basename(url))

if (file.exists(destfile)) {
message(paste("Training data already exists. Overwriting", destfile))
message("The training data already exists. Skipping download.")
return(destfile)
}

req <- curl::curl_fetch_memory(url, curl::new_handle(
progressfunction = progress_fun,
noprogress = !isTRUE(progress)
))
if(progress)

if (progress) {
cat("\n")
if(req$status_code != 200)
}
if (req$status_code != 200) {
stop("Download failed: HTTP ", req$status_code, call. = FALSE)

}
writeBin(req$content, destfile)
return(destfile)
}

progress_fun <- function(down, up) {
total <- down[[1]]
now <- down[[2]]
pct <- if(length(total) && total > 0){
paste0("(", round(now/total * 100), "%)")
pct <- if (length(total) && total > 0) {
paste0("(", round(now / total * 100), "%)")
} else {
""
}
if(now > 10000)
if (now > 10000) {
cat("\r Downloaded:", sprintf("%.2f", now / 2^20), "MB ", pct)
}
TRUE
}

warn_on_linux <- function(){
if(identical(.Platform$OS.type, "unix") && !identical(Sys.info()[["sysname"]], "Darwin")){
warn_on_linux <- function() {
if (identical(.Platform$OS.type, "unix") && !identical(Sys.info()[["sysname"]], "Darwin")) {
warning("On Linux you should install training data via yum/apt. Please check the manual page.", call. = FALSE)
}
}
31 changes: 30 additions & 1 deletion man/tessdata.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 6141f79

Please sign in to comment.