Skip to content

Commit

Permalink
feat: Add multilingual support for ocr module (#3325)
Browse files Browse the repository at this point in the history
* Add multilingual support for ocr module

* Add OCR language as server var that is passed into Collector
Support all valid tesseract language codes
Filter and parse only valid codes with fallbacks

* persist TARGET_OCR_LANG

* update docker example env

---------

Co-authored-by: Timothy Carambat <[email protected]>
  • Loading branch information
doodle777 and timothycarambat authored Feb 27, 2025
1 parent c928d3d commit df166eb
Show file tree
Hide file tree
Showing 8 changed files with 229 additions and 7 deletions.
6 changes: 4 additions & 2 deletions collector/processSingleFile/convert/asImage.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ const {
const OCRLoader = require("../../utils/OCRLoader");
const { default: slugify } = require("slugify");

async function asImage({ fullFilePath = "", filename = "" }) {
let content = await new OCRLoader().ocrImage(fullFilePath);
async function asImage({ fullFilePath = "", filename = "", options = {} }) {
let content = await new OCRLoader({
targetLanguages: options?.ocr?.langList,
}).ocrImage(fullFilePath);

if (!content?.length) {
console.error(`Resulting text content was empty for ${filename}.`);
Expand Down
6 changes: 4 additions & 2 deletions collector/processSingleFile/convert/asPDF/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ const { default: slugify } = require("slugify");
const PDFLoader = require("./PDFLoader");
const OCRLoader = require("../../../utils/OCRLoader");

async function asPdf({ fullFilePath = "", filename = "" }) {
async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
const pdfLoader = new PDFLoader(fullFilePath, {
splitPages: true,
});
Expand All @@ -22,7 +22,9 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
console.log(
`[asPDF] No text content found for ${filename}. Will attempt OCR parse.`
);
docs = await new OCRLoader().ocrPDF(fullFilePath);
docs = await new OCRLoader({
targetLanguages: options?.ocr?.langList,
}).ocrPDF(fullFilePath);
}

for (const doc of docs) {
Expand Down
53 changes: 50 additions & 3 deletions collector/utils/OCRLoader/index.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,61 @@
const fs = require("fs");
const os = require("os");
const path = require("path");
const { VALID_LANGUAGE_CODES } = require("./validLangs");

class OCRLoader {
constructor() {
/**
* The language code(s) to use for the OCR.
* @type {string[]}
*/
language;
/**
* The cache directory for the OCR.
* @type {string}
*/
cacheDir;

/**
* The constructor for the OCRLoader.
* @param {Object} options - The options for the OCRLoader.
* @param {string} options.targetLanguages - The target languages to use for the OCR as a comma separated string. eg: "eng,deu,..."
*/
constructor({ targetLanguages = "eng" } = {}) {
this.language = this.parseLanguages(targetLanguages);
this.cacheDir = path.resolve(
process.env.STORAGE_DIR
? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
: path.resolve(__dirname, `../../../server/storage/models/tesseract`)
);

// Ensure the cache directory exists or else Tesseract will persist the cache in the default location.
if (!fs.existsSync(this.cacheDir))
fs.mkdirSync(this.cacheDir, { recursive: true });
this.log(
`OCRLoader initialized with language support for:`,
this.language.map((lang) => VALID_LANGUAGE_CODES[lang]).join(", ")
);
}

/**
* Parses the language code from a provided comma separated string of language codes.
* @param {string} language - The language code to parse.
* @returns {string[]} The parsed language code.
*/
parseLanguages(language = null) {
try {
if (!language || typeof language !== "string") return ["eng"];
const langList = language
.split(",")
.map((lang) => (lang.trim() !== "" ? lang.trim() : null))
.filter(Boolean)
.filter((lang) => VALID_LANGUAGE_CODES.hasOwnProperty(lang));
if (langList.length === 0) return ["eng"];
return langList;
} catch (e) {
this.log(`Error parsing languages: ${e.message}`, e.stack);
return ["eng"];
}
}

log(text, ...args) {
Expand Down Expand Up @@ -70,7 +117,7 @@ class OCRLoader {
Array(NUM_WORKERS)
.fill(0)
.map(() =>
createWorker("eng", OEM.LSTM_ONLY, {
createWorker(this.language, OEM.LSTM_ONLY, {
cachePath: this.cacheDir,
})
)
Expand Down Expand Up @@ -188,7 +235,7 @@ class OCRLoader {
this.log(`Starting OCR of ${documentTitle}`);
const startTime = Date.now();
const { createWorker, OEM } = require("tesseract.js");
worker = await createWorker("eng", OEM.LSTM_ONLY, {
worker = await createWorker(this.language, OEM.LSTM_ONLY, {
cachePath: this.cacheDir,
});

Expand Down
155 changes: 155 additions & 0 deletions collector/utils/OCRLoader/validLangs.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/*
To get the list of valid language codes - do the following:
Open the following URL in your browser: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
Check this element is the proper table tbody with all the codes via console:
document.getElementsByTagName('table').item(0).children.item(1)
Now, copy the following code and paste it into the console:
function parseLangs() {
let langs = {};
Array.from(document.getElementsByTagName('table').item(0).children.item(1).children).forEach((el) => {
const [codeEl, languageEl, ...rest] = el.children
const code = codeEl.innerText.trim()
const language = languageEl.innerText.trim()
if (!!code && !!language) langs[code] = language
})
return langs;
}
now, run the function:
copy(parseLangs())
*/

/**
 * Allow-list of OCR language codes mapped to their human-readable names.
 * Keys are the codes a caller may request; values are display names.
 * Note that `equ` and `osd` are detection modules rather than languages.
 * The object is frozen so that consumers cannot alter the allow-list at runtime.
 * @type {Readonly<Record<string, string>>}
 */
const VALID_LANGUAGE_CODES = Object.freeze({
  afr: "Afrikaans",
  amh: "Amharic",
  ara: "Arabic",
  asm: "Assamese",
  aze: "Azerbaijani",
  aze_cyrl: "Azerbaijani - Cyrilic",
  bel: "Belarusian",
  ben: "Bengali",
  bod: "Tibetan",
  bos: "Bosnian",
  bre: "Breton",
  bul: "Bulgarian",
  cat: "Catalan; Valencian",
  ceb: "Cebuano",
  ces: "Czech",
  chi_sim: "Chinese - Simplified",
  chi_tra: "Chinese - Traditional",
  chr: "Cherokee",
  cos: "Corsican",
  cym: "Welsh",
  dan: "Danish",
  dan_frak: "Danish - Fraktur (contrib)",
  deu: "German",
  deu_frak: "German - Fraktur (contrib)",
  deu_latf: "German (Fraktur Latin)",
  dzo: "Dzongkha",
  ell: "Greek, Modern (1453-)",
  eng: "English",
  enm: "English, Middle (1100-1500)",
  epo: "Esperanto",
  equ: "Math / equation detection module",
  est: "Estonian",
  eus: "Basque",
  fao: "Faroese",
  fas: "Persian",
  fil: "Filipino (old - Tagalog)",
  fin: "Finnish",
  fra: "French",
  frk: "German - Fraktur (now deu_latf)",
  frm: "French, Middle (ca.1400-1600)",
  fry: "Western Frisian",
  gla: "Scottish Gaelic",
  gle: "Irish",
  glg: "Galician",
  grc: "Greek, Ancient (to 1453) (contrib)",
  guj: "Gujarati",
  hat: "Haitian; Haitian Creole",
  heb: "Hebrew",
  hin: "Hindi",
  hrv: "Croatian",
  hun: "Hungarian",
  hye: "Armenian",
  iku: "Inuktitut",
  ind: "Indonesian",
  isl: "Icelandic",
  ita: "Italian",
  ita_old: "Italian - Old",
  jav: "Javanese",
  jpn: "Japanese",
  kan: "Kannada",
  kat: "Georgian",
  kat_old: "Georgian - Old",
  kaz: "Kazakh",
  khm: "Central Khmer",
  kir: "Kirghiz; Kyrgyz",
  kmr: "Kurmanji (Kurdish - Latin Script)",
  kor: "Korean",
  kor_vert: "Korean (vertical)",
  kur: "Kurdish (Arabic Script)",
  lao: "Lao",
  lat: "Latin",
  lav: "Latvian",
  lit: "Lithuanian",
  ltz: "Luxembourgish",
  mal: "Malayalam",
  mar: "Marathi",
  mkd: "Macedonian",
  mlt: "Maltese",
  mon: "Mongolian",
  mri: "Maori",
  msa: "Malay",
  mya: "Burmese",
  nep: "Nepali",
  nld: "Dutch; Flemish",
  nor: "Norwegian",
  oci: "Occitan (post 1500)",
  ori: "Oriya",
  osd: "Orientation and script detection module",
  pan: "Panjabi; Punjabi",
  pol: "Polish",
  por: "Portuguese",
  pus: "Pushto; Pashto",
  que: "Quechua",
  ron: "Romanian; Moldavian; Moldovan",
  rus: "Russian",
  san: "Sanskrit",
  sin: "Sinhala; Sinhalese",
  slk: "Slovak",
  slk_frak: "Slovak - Fraktur (contrib)",
  slv: "Slovenian",
  snd: "Sindhi",
  spa: "Spanish; Castilian",
  spa_old: "Spanish; Castilian - Old",
  sqi: "Albanian",
  srp: "Serbian",
  srp_latn: "Serbian - Latin",
  sun: "Sundanese",
  swa: "Swahili",
  swe: "Swedish",
  syr: "Syriac",
  tam: "Tamil",
  tat: "Tatar",
  tel: "Telugu",
  tgk: "Tajik",
  tgl: "Tagalog (new - Filipino)",
  tha: "Thai",
  tir: "Tigrinya",
  ton: "Tonga",
  tur: "Turkish",
  uig: "Uighur; Uyghur",
  ukr: "Ukrainian",
  urd: "Urdu",
  uzb: "Uzbek",
  uzb_cyrl: "Uzbek - Cyrilic",
  vie: "Vietnamese",
  yid: "Yiddish",
  yor: "Yoruba",
});

module.exports.VALID_LANGUAGE_CODES = VALID_LANGUAGE_CODES;
5 changes: 5 additions & 0 deletions docker/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -321,3 +321,8 @@ GID='1000'
# Enable simple SSO passthrough to pre-authenticate users from a third party service.
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
# SIMPLE_SSO_ENABLED=1

# Specify the target languages for when using OCR to parse images and PDFs.
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol
5 changes: 5 additions & 0 deletions server/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,8 @@ TTS_PROVIDER="native"
# Enable simple SSO passthrough to pre-authenticate users from a third party service.
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
# SIMPLE_SSO_ENABLED=1

# Specify the target languages for when using OCR to parse images and PDFs.
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol
3 changes: 3 additions & 0 deletions server/utils/collectorApi/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ class CollectorApi {
whisperProvider: process.env.WHISPER_PROVIDER || "local",
WhisperModelPref: process.env.WHISPER_MODEL_PREF,
openAiKey: process.env.OPEN_AI_KEY || null,
ocr: {
langList: process.env.TARGET_OCR_LANG || "eng",
},
};
}

Expand Down
3 changes: 3 additions & 0 deletions server/utils/helpers/updateENV.js
Original file line number Diff line number Diff line change
Expand Up @@ -978,6 +978,9 @@ function dumpENV() {

// Nvidia NIM Keys that are automatically managed
"NVIDIA_NIM_LLM_MODEL_TOKEN_LIMIT",

// OCR Language Support
"TARGET_OCR_LANG",
];

// Simple sanitization of each value to prevent ENV injection via newline or quote escaping.
Expand Down

0 comments on commit df166eb

Please sign in to comment.