diff --git a/.Rbuildignore b/.Rbuildignore index fddda98..a13bd41 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,3 +13,7 @@ vignettes/.*\.png$ ^configure.log$ ^\.github$ ^\.vscode$ +^README\.Rmd$ +^README\.html$ +^docs$ +^pkgdown$ diff --git a/NAMESPACE b/NAMESPACE index 920705d..3924c0a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ S3method(print,tesseract) export(ocr) export(ocr_data) export(tesseract) +export(tesseract_contributed_download) export(tesseract_download) export(tesseract_info) export(tesseract_params) diff --git a/NEWS b/NEWS deleted file mode 100644 index e32bac7..0000000 --- a/NEWS +++ /dev/null @@ -1,102 +0,0 @@ -5.2.1 - - Fix shell script for cross compilation - -5.2.0 - - Windows: update to tesseract 5.3.2 - -5.1.0 - - Win: update to tesseract 5.1.0. - - Win: apply patch for freezes when running under UTF-8 in R-4.2. - See: https://github.com/tesseract-ocr/tesseract/issues/3830 - -5.0.0 - - Win/Mac: update to libtesseract 5.0.1 - - Remove locale workaround on libtesseract 4.1+ (should only be needed for 4.0) - - Remove cruft that was needed to support Solaris - -4.2.0 - - Prepare for API changes in upcoming Tesseract 5 release - - Change the default language="eng" in tesseract() - -4.1.2 - - Fix for upstream master/main renames in language repos - -4.1.1 - - Win/Mac: update to libtesseract 4.1.1 - -4.1 - - Fix memory leak in ocr_data() - - Windows / MacOS: update to libtesseract 4.1.0. This re-enables - the whitelist/blacklist options that were missing in Tesseract 4.0 - -4.0 - - Windows, MacOS: Upgrade to upstream Tesseract 4.0! Completely new OCR engine. - - Tesseract 4 has a new training data format. On Windows / MacOS you need to - re-download your language data with tesseract_download(). The package uses - separate directories for storing Tesseract 3 vs 4 data so they shouldn't get - mixed up (hopefully). - - Drop hard-dependency on tibble (only load if available) - -2.3 - - Fix problem with setlocale() not properly restoring locale. - - Switch examples from dontrun{} to donttest{}, and '--run-donttest' on travis/appveyor - -2.2 - - Fixes for breaking changes in Tesseract 4.0.0 beta.3 - - Set LC_ALL = C when initiating tesseract - - Include to support Tesseract 4 - -2.1 - - Fixes for 4.0.0-beta.1: they switched to semver + other data branch - - Set LC_CTYPE to "C" when loading training data (required for some asian languages) - - Add back OSD training data on Windows - -2.0 - - Set tesseract parameters at init so that all parameters types now actually work! - - New function tesseract_params() lists all supported parameters and their default - - Added 'config' argument to tesseract() which specifies a file with parameter values - - Internally validate paremeter names before init to revent tesseract crashes - - Rewrite the ocr_data() function in C++ to make it much faster - - Tesseract 4 now gets data from the tessdata_fast repo as recommended upstream - - Use default resolution of 300dpi when image does not contain resolution info - -1.9 - - Tesseract 4 now dowloads training data from the "tessdata_fast" repo - - Add ocr_data() function that parses the hOCR output - -1.8 - - Add support for HOCR output (#20) - - Remove 'script' and 'orientation' attributes in output (doesn't work in Tesseract 4) - -1.7 (internal) - - Add support upcoming Tesseract 4 (compiler fix + separate tessdata dir) - - Configure script now explicitly tests for CXX11 (required by Tesseract 4) - -1.6 - - Windows: update libtesseract to 3.05.01 - - tesseract_download now uses 3.04 tree (instead of 4.00) as suggested in readme - - For static packags on Win/Mac, languages stored in: rappdirs::user_data_dir('tesseract') - - Use 'png' instead of 'tiff' to read magick images - - Compile with $(C_VISIBILITY) to hide internal symbols (requires Rcpp 0.12.12) - - Use Rcpp symbol registration - -1.4 - - Run engine finalizer on R exit (requires Rcpp 0.12.10) - - Move autobrew script to separate repository - - Add symbol registration - -1.3 - - tesseract() gains an 'options' parameter for setting engine variables - - New tessseract_download() function for installing training data on Win/Mac - - Initiate default tesseract engine onAttach() to fail for missing training data - - Add support for ocr() on magick images - -1.2 - - Try to fix build for CRAN OS-X, again. - -1.1 - - Try to fix build for CRAN OS-X build server - - Show 'loaded' and 'available' languages in print.tesseract() - -1.0 - - Initial CRAN release diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..82f1751 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,10 @@ +# 5.3.0 + - This is a fork of the original tesseract package made by Jeroen Ooms. + - Differences: + - Uses cpp11 instead of Rcpp. + - Provides functions to download and use the slower but more accurate + models. + - Provides functions to download and use contributed models. + - The documentation works offline. + - Explicitly tests on Ubuntu with Tesseract 4 (Ubuntu 22.04 default) and + Tesseract 5 (PPA). diff --git a/R/tessdata.R b/R/tessdata.R index 4d8fc8a..3ad0607 100644 --- a/R/tessdata.R +++ b/R/tessdata.R @@ -27,8 +27,9 @@ #' @param progress print progress while downloading #' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files) #' @examples \dontrun{ -#' if(is.na(match("fra", tesseract_info()$available))) +#' if (is.na(match("fra", tesseract_info()$available))) { #' tesseract_download("fra", model = "best") +#' } #' french <- tesseract("fra") #' file <- system.file("examples", "french.png", package = "cpp11tesseract") #' text <- ocr(file, engine = french) @@ -37,38 +38,107 @@ tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress = interactive()) { stopifnot(is.character(lang)) model <- match.arg(model) - if(!length(datapath)){ + if (!length(datapath)) { warn_on_linux() datapath <- tesseract_info()$datapath } datapath <- normalizePath(datapath, mustWork = TRUE) version <- tesseract_version_major() - if(version < 4){ + if (version < 4) { repo <- "tessdata" release <- "3.04.00" } else { repo <- paste0("tessdata_", model) release <- "4.1.0" } - url <- sprintf("https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata", repo, release, lang) + download_helper(url, datapath, progress) +} +#' Tesseract Contributed Training Data +#' +#' Helper function to download training data from the contributed +#' [tessdata_contrib](https://github.com/tesseract-ocr/tessdata_contrib) repository. +#' +#' @export +#' @aliases tessdata +#' @rdname tessdata +#' @family tesseract +#' @seealso [tesseract_download] +#' @param lang three letter code for language, see [tessdata](https://github.com/tesseract-ocr/tessdata) repository. +#' @param datapath destination directory where to download store the file +#' @param model either `fast` or `best` is currently supported. The latter downloads +#' more accurate (but slower) trained models for Tesseract 4.0 or higher +#' @param progress print progress while downloading +#' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files) +#' @examples +#' \dontrun{ +#' if (is.na(match("grc_hist", tesseract_info()$available))) { +#' tesseract_contributed_download("grc_hist") +#' } +#' greek <- tesseract("grc_hist") +#' file <- system.file("examples", "polytonicgreek.png", package = "cpp11tesseract") +#' text <- ocr(file, engine = greek) +#' cat(text) +#' } +tesseract_contributed_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress = interactive()) { + stopifnot(is.character(lang)) + if (!any(lang %in% c("grc_hist", "akk"))) { + stop("The only available contributed models are Akkadian and Polytonic Greek (for now).", call. = FALSE) + } + model <- match.arg(model) + if (!length(datapath)) { + warn_on_linux() + datapath <- tesseract_info()$datapath + } + datapath <- normalizePath(datapath, mustWork = TRUE) + version <- tesseract_version_major() + + if (lang == "grc_hist" && version < 4) { + stop("The Polytonic Greek model is only available for Tesseract 4.0 or higher.", call. = FALSE) + } + + if (lang == "grc_hist") { + if (model == "fast") { + warning("The Polytonic Greek model is only available in 'best' quality.", call. = FALSE) + } + release <- "grc_hist/best" + } + + if (lang == "akk" && version < 4) { + release <- "akk/legacy" + } else if (lang == "akk" && model == "best") { + release <- "akk/best" + } else if (lang == "akk" && model == "fast") { + release <- "akk/fast" + } + + url <- sprintf("https://github.com/tesseract-ocr/tessdata_contrib/raw/main/%s/%s.traineddata", release, lang) + print(url) + + download_helper(url, datapath, progress) +} + +download_helper <- function(url, datapath, progress) { destfile <- file.path(datapath, basename(url)) if (file.exists(destfile)) { - message(paste("Training data already exists. Overwriting", destfile)) + message("The training data already exists. Skipping download.") + return(destfile) } req <- curl::curl_fetch_memory(url, curl::new_handle( progressfunction = progress_fun, noprogress = !isTRUE(progress) )) - if(progress) + + if (progress) { cat("\n") - if(req$status_code != 200) + } + if (req$status_code != 200) { stop("Download failed: HTTP ", req$status_code, call. = FALSE) - + } writeBin(req$content, destfile) return(destfile) } @@ -76,18 +146,19 @@ tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress_fun <- function(down, up) { total <- down[[1]] now <- down[[2]] - pct <- if(length(total) && total > 0){ - paste0("(", round(now/total * 100), "%)") + pct <- if (length(total) && total > 0) { + paste0("(", round(now / total * 100), "%)") } else { "" } - if(now > 10000) + if (now > 10000) { cat("\r Downloaded:", sprintf("%.2f", now / 2^20), "MB ", pct) + } TRUE } -warn_on_linux <- function(){ - if(identical(.Platform$OS.type, "unix") && !identical(Sys.info()[["sysname"]], "Darwin")){ +warn_on_linux <- function() { + if (identical(.Platform$OS.type, "unix") && !identical(Sys.info()[["sysname"]], "Darwin")) { warning("On Linux you should install training data via yum/apt. Please check the manual page.", call. = FALSE) } } diff --git a/man/tessdata.Rd b/man/tessdata.Rd index 8bbcb55..25b8193 100644 --- a/man/tessdata.Rd +++ b/man/tessdata.Rd @@ -3,6 +3,7 @@ \name{tesseract_download} \alias{tesseract_download} \alias{tessdata} +\alias{tesseract_contributed_download} \title{Tesseract Training Data} \usage{ tesseract_download( @@ -11,6 +12,13 @@ tesseract_download( model = c("fast", "best"), progress = interactive() ) + +tesseract_contributed_download( + lang, + datapath = NULL, + model = c("fast", "best"), + progress = interactive() +) } \arguments{ \item{lang}{three letter code for language, see \href{https://github.com/tesseract-ocr/tessdata}{tessdata} repository.} @@ -27,6 +35,9 @@ Helper function to download training data from the official \href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tessdata} repository. On Linux, the fast training data can be installed directly with \href{https://src.fedoraproject.org/rpms/tesseract}{yum} or \href{https://packages.debian.org/search?suite=stable§ion=all&arch=any&searchon=names&keywords=tesseract-ocr-}{apt-get}. + +Helper function to download training data from the contributed +\href{https://github.com/tesseract-ocr/tessdata_contrib}{tessdata_contrib} repository. } \details{ Tesseract uses training data to perform OCR. Most systems default to English @@ -43,18 +54,36 @@ and stores it in a the path on disk given by the \code{TESSDATA_PREFIX} variable } \examples{ \dontrun{ -if(is.na(match("fra", tesseract_info()$available))) +if (is.na(match("fra", tesseract_info()$available))) { tesseract_download("fra", model = "best") +} french <- tesseract("fra") file <- system.file("examples", "french.png", package = "cpp11tesseract") text <- ocr(file, engine = french) cat(text) } +\dontrun{ +if (is.na(match("grc_hist", tesseract_info()$available))) { + tesseract_contributed_download("grc_hist") +} +greek <- tesseract("grc_hist") +file <- system.file("examples", "polytonicgreek.png", package = "cpp11tesseract") +text <- ocr(file, engine = greek) +cat(text) +} } \references{ +\href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tesseract wiki: training data} + \href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tesseract wiki: training data} } \seealso{ +\link{tesseract_download} + +Other tesseract: +\code{\link{ocr}()}, +\code{\link{tesseract}()} + Other tesseract: \code{\link{ocr}()}, \code{\link{tesseract}()}