feat: Add multilingual support for ocr module (#3325)

* Add multilingual support for OCR module * Add OCR language as a server var that is passed into the Collector. Support all valid Tesseract language codes. Filter and parse only valid codes, with fallbacks * persist TARGET_OCR_LANG * update docker example env --------- Co-authored-by: Timothy Carambat <[email protected]>
Mintplex-Labs · Feb 27, 2025 · df166eb · df166eb
1 parent c928d3d
commit df166eb
Show file tree

Hide file tree

Showing 8 changed files with 229 additions and 7 deletions.
diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js
@@ -8,8 +8,10 @@ const {
 const OCRLoader = require("../../utils/OCRLoader");
 const { default: slugify } = require("slugify");
 
-async function asImage({ fullFilePath = "", filename = "" }) {
-  let content = await new OCRLoader().ocrImage(fullFilePath);
+async function asImage({ fullFilePath = "", filename = "", options = {} }) {
+  let content = await new OCRLoader({
+    targetLanguages: options?.ocr?.langList,
+  }).ocrImage(fullFilePath);
 
   if (!content?.length) {
     console.error(`Resulting text content was empty for ${filename}.`);

diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js
@@ -9,7 +9,7 @@ const { default: slugify } = require("slugify");
 const PDFLoader = require("./PDFLoader");
 const OCRLoader = require("../../../utils/OCRLoader");
 
-async function asPdf({ fullFilePath = "", filename = "" }) {
+async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
   const pdfLoader = new PDFLoader(fullFilePath, {
     splitPages: true,
   });
@@ -22,7 +22,9 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
     console.log(
       `[asPDF] No text content found for ${filename}. Will attempt OCR parse.`
     );
-    docs = await new OCRLoader().ocrPDF(fullFilePath);
+    docs = await new OCRLoader({
+      targetLanguages: options?.ocr?.langList,
+    }).ocrPDF(fullFilePath);
   }
 
   for (const doc of docs) {

diff --git a/collector/utils/OCRLoader/index.js b/collector/utils/OCRLoader/index.js
@@ -1,14 +1,61 @@
 const fs = require("fs");
 const os = require("os");
 const path = require("path");
+const { VALID_LANGUAGE_CODES } = require("./validLangs");
 
 class OCRLoader {
-  constructor() {
+  /**
+   * The language code(s) to use for the OCR.
+   * @type {string[]}
+   */
+  language;
+  /**
+   * The cache directory for the OCR.
+   * @type {string}
+   */
+  cacheDir;
+
+  /**
+   * The constructor for the OCRLoader.
+   * @param {Object} options - The options for the OCRLoader.
+   * @param {string} [options.targetLanguages="eng"] - The target languages to use for the OCR as a comma separated string of language codes, e.g. "eng,deu,...". Defaults to "eng" when omitted.
+   */
+  constructor({ targetLanguages = "eng" } = {}) {
+    this.language = this.parseLanguages(targetLanguages);
     this.cacheDir = path.resolve(
       process.env.STORAGE_DIR
         ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
         : path.resolve(__dirname, `../../../server/storage/models/tesseract`)
     );
+
+    // Ensure the cache directory exists or else Tesseract will persist the cache in the default location.
+    if (!fs.existsSync(this.cacheDir))
+      fs.mkdirSync(this.cacheDir, { recursive: true });
+    this.log(
+      `OCRLoader initialized with language support for:`,
+      this.language.map((lang) => VALID_LANGUAGE_CODES[lang]).join(", ")
+    );
+  }
+
+  /**
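+   * Parses a comma separated string of language codes into a list of valid codes.
+   * Entries are trimmed, empty entries are dropped, and only codes that are keys of
+   * VALID_LANGUAGE_CODES (exact match) are kept; order and duplicates are preserved.
+   * Falls back to ["eng"] when the input is not a non-empty string, when no valid
+   * code remains, or when parsing throws.
+   * @param {string} language - Comma separated language codes, e.g. "eng,deu".
+   * @returns {string[]} The valid language codes, or ["eng"] as a fallback.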
+   */
+  parseLanguages(language = null) {
+    try {
+      if (!language || typeof language !== "string") return ["eng"];
+      const langList = language
+        .split(",")
+        .map((lang) => (lang.trim() !== "" ? lang.trim() : null))
+        .filter(Boolean)
+        .filter((lang) => VALID_LANGUAGE_CODES.hasOwnProperty(lang));
+      if (langList.length === 0) return ["eng"];
+      return langList;
+    } catch (e) {
+      this.log(`Error parsing languages: ${e.message}`, e.stack);
+      return ["eng"];
+    }
   }
 
   log(text, ...args) {
@@ -70,7 +117,7 @@ class OCRLoader {
       Array(NUM_WORKERS)
         .fill(0)
         .map(() =>
-          createWorker("eng", OEM.LSTM_ONLY, {
+          createWorker(this.language, OEM.LSTM_ONLY, {
             cachePath: this.cacheDir,
           })
         )
@@ -188,7 +235,7 @@ class OCRLoader {
       this.log(`Starting OCR of ${documentTitle}`);
       const startTime = Date.now();
       const { createWorker, OEM } = require("tesseract.js");
-      worker = await createWorker("eng", OEM.LSTM_ONLY, {
+      worker = await createWorker(this.language, OEM.LSTM_ONLY, {
         cachePath: this.cacheDir,
       });
 

diff --git a/collector/utils/OCRLoader/validLangs.js b/collector/utils/OCRLoader/validLangs.js
@@ -0,0 +1,155 @@
+/*
+
+To get the list of valid language codes - do the following:
+Open the following URL in your browser: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
+
+In the browser console, confirm that this element is the table's tbody containing all the language codes:
+document.getElementsByTagName('table').item(0).children.item(1)
+
+Now, copy the following code and paste it into the console:
+function parseLangs() {
+let langs = {};
+  Array.from(document.getElementsByTagName('table').item(0).children.item(1).children).forEach((el) => {
+    const [codeEl, languageEl, ...rest] = el.children
+    const code = codeEl.innerText.trim()
+    const language = languageEl.innerText.trim()
+    if (!!code && !!language) langs[code] = language
+  })
+  return langs;
+}
+
+Now run the function and copy its result to the clipboard:
+copy(parseLangs())
+*/
+
+const VALID_LANGUAGE_CODES = {
+  afr: "Afrikaans",
+  amh: "Amharic",
+  ara: "Arabic",
+  asm: "Assamese",
+  aze: "Azerbaijani",
+  aze_cyrl: "Azerbaijani - Cyrilic",
+  bel: "Belarusian",
+  ben: "Bengali",
+  bod: "Tibetan",
+  bos: "Bosnian",
+  bre: "Breton",
+  bul: "Bulgarian",
+  cat: "Catalan; Valencian",
+  ceb: "Cebuano",
+  ces: "Czech",
+  chi_sim: "Chinese - Simplified",
+  chi_tra: "Chinese - Traditional",
+  chr: "Cherokee",
+  cos: "Corsican",
+  cym: "Welsh",
+  dan: "Danish",
+  dan_frak: "Danish - Fraktur (contrib)",
+  deu: "German",
+  deu_frak: "German - Fraktur (contrib)",
+  deu_latf: "German (Fraktur Latin)",
+  dzo: "Dzongkha",
+  ell: "Greek, Modern (1453-)",
+  eng: "English",
+  enm: "English, Middle (1100-1500)",
+  epo: "Esperanto",
+  equ: "Math / equation detection module",
+  est: "Estonian",
+  eus: "Basque",
+  fao: "Faroese",
+  fas: "Persian",
+  fil: "Filipino (old - Tagalog)",
+  fin: "Finnish",
+  fra: "French",
+  frk: "German - Fraktur (now deu_latf)",
+  frm: "French, Middle (ca.1400-1600)",
+  fry: "Western Frisian",
+  gla: "Scottish Gaelic",
+  gle: "Irish",
+  glg: "Galician",
+  grc: "Greek, Ancient (to 1453) (contrib)",
+  guj: "Gujarati",
+  hat: "Haitian; Haitian Creole",
+  heb: "Hebrew",
+  hin: "Hindi",
+  hrv: "Croatian",
+  hun: "Hungarian",
+  hye: "Armenian",
+  iku: "Inuktitut",
+  ind: "Indonesian",
+  isl: "Icelandic",
+  ita: "Italian",
+  ita_old: "Italian - Old",
+  jav: "Javanese",
+  jpn: "Japanese",
+  kan: "Kannada",
+  kat: "Georgian",
+  kat_old: "Georgian - Old",
+  kaz: "Kazakh",
+  khm: "Central Khmer",
+  kir: "Kirghiz; Kyrgyz",
+  kmr: "Kurmanji (Kurdish - Latin Script)",
+  kor: "Korean",
+  kor_vert: "Korean (vertical)",
+  kur: "Kurdish (Arabic Script)",
+  lao: "Lao",
+  lat: "Latin",
+  lav: "Latvian",
+  lit: "Lithuanian",
+  ltz: "Luxembourgish",
+  mal: "Malayalam",
+  mar: "Marathi",
+  mkd: "Macedonian",
+  mlt: "Maltese",
+  mon: "Mongolian",
+  mri: "Maori",
+  msa: "Malay",
+  mya: "Burmese",
+  nep: "Nepali",
+  nld: "Dutch; Flemish",
+  nor: "Norwegian",
+  oci: "Occitan (post 1500)",
+  ori: "Oriya",
+  osd: "Orientation and script detection module",
+  pan: "Panjabi; Punjabi",
+  pol: "Polish",
+  por: "Portuguese",
+  pus: "Pushto; Pashto",
+  que: "Quechua",
+  ron: "Romanian; Moldavian; Moldovan",
+  rus: "Russian",
+  san: "Sanskrit",
+  sin: "Sinhala; Sinhalese",
+  slk: "Slovak",
+  slk_frak: "Slovak - Fraktur (contrib)",
+  slv: "Slovenian",
+  snd: "Sindhi",
+  spa: "Spanish; Castilian",
+  spa_old: "Spanish; Castilian - Old",
+  sqi: "Albanian",
+  srp: "Serbian",
+  srp_latn: "Serbian - Latin",
+  sun: "Sundanese",
+  swa: "Swahili",
+  swe: "Swedish",
+  syr: "Syriac",
+  tam: "Tamil",
+  tat: "Tatar",
+  tel: "Telugu",
+  tgk: "Tajik",
+  tgl: "Tagalog (new - Filipino)",
+  tha: "Thai",
+  tir: "Tigrinya",
+  ton: "Tonga",
+  tur: "Turkish",
+  uig: "Uighur; Uyghur",
+  ukr: "Ukrainian",
+  urd: "Urdu",
+  uzb: "Uzbek",
+  uzb_cyrl: "Uzbek - Cyrilic",
+  vie: "Vietnamese",
+  yid: "Yiddish",
+  yor: "Yoruba",
+};
+
+module.exports.VALID_LANGUAGE_CODES = VALID_LANGUAGE_CODES;
diff --git a/docker/.env.example b/docker/.env.example
@@ -321,3 +321,8 @@ GID='1000'
 # Enable simple SSO passthrough to pre-authenticate users from a third party service.
 # See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
 # SIMPLE_SSO_ENABLED=1
+
+# Specify the target languages for when using OCR to parse images and PDFs.
+# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
+# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
+# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol
diff --git a/server/.env.example b/server/.env.example
@@ -310,3 +310,8 @@ TTS_PROVIDER="native"
 # Enable simple SSO passthrough to pre-authenticate users from a third party service.
 # See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
 # SIMPLE_SSO_ENABLED=1
+
+# Specify the target languages for when using OCR to parse images and PDFs.
+# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
+# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
+# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol
diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js
@@ -20,6 +20,9 @@ class CollectorApi {
       whisperProvider: process.env.WHISPER_PROVIDER || "local",
       WhisperModelPref: process.env.WHISPER_MODEL_PREF,
       openAiKey: process.env.OPEN_AI_KEY || null,
+      ocr: {
+        langList: process.env.TARGET_OCR_LANG || "eng",
+      },
     };
   }
 

diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
@@ -978,6 +978,9 @@ function dumpENV() {
 
     // Nvidia NIM Keys that are automatically managed
     "NVIDIA_NIM_LLM_MODEL_TOKEN_LIMIT",
+
+    // OCR Language Support
+    "TARGET_OCR_LANG",
   ];
 
   // Simple sanitization of each value to prevent ENV injection via newline or quote escaping.