diff --git a/imixs-archive-documents/README.md b/imixs-archive-documents/README.md index fb7ee34d..8c10ad1a 100644 --- a/imixs-archive-documents/README.md +++ b/imixs-archive-documents/README.md @@ -45,9 +45,28 @@ Both, the *OCRDocumentPlugin* as also the *OCRDocumentAdapter* can be configured X-Tika-PDFocrStrategy=OCR_AND_TEXT_EXTRACTION X-Tika-PDFOcrImageType=RGB - X-Tika-PDFOcrDPI=400 + X-Tika-PDFOcrDPI=72 + X-Tika-OCRLanguage=eng+deu -In this example configuration the OCR processing will be started with 3 additional tika options. For more details about the OCR configuration see the [Imixs-Archive-OCR project](https://github.com/imixs/imixs-archive/tree/master/imixs-archive-ocr). +In this example configuration the OCR processing will be started with 4 additional Tika options. + + - X-Tika-PDFOcrImageType=RGB - set color mode + - X-Tika-PDFOcrDPI=72 - set DPI to 72 + - X-Tika-OCRLanguage=eng+deu - set OCR languages to English and German + + +#### Overriding the configured language as part of your request + +Different requests may need processing using different language models. These can be specified for specific requests using the X-Tika-OCRLanguage custom header. An example of this is shown below: + + X-Tika-OCRLanguage=deu + +Or for multiple languages: + + X-Tika-OCRLanguage=eng+fra + + +For more details about the OCR configuration see the [Imixs-Archive-OCR project](https://github.com/imixs/imixs-archive/tree/master/imixs-archive-ocr). ## Searching Documents diff --git a/imixs-archive-ocr/README.md b/imixs-archive-ocr/README.md index ba652ef1..aa60add2 100644 --- a/imixs-archive-ocr/README.md +++ b/imixs-archive-ocr/README.md @@ -1,7 +1,6 @@ # Imixs-Archive-OCR -*Imixs-Archive-OCR* is a sub-project of Imixs-Archive. The project provides methods to extract textual information from documents -attached to a Workitem. The text content of attachments is either extracted by the PDFBox API or by optical character recognition (OCR). 
This text content is stored in the $file attribute 'text' and can be use for further processing or to search for document content. +*Imixs-Archive-OCR* is a sub-project of Imixs-Archive. The project is decoupled from the Imixs-Workflow Engine and provides a service component to extract textual information from documents attached to a Workitem. The text content of attachments is either extracted by the PDFBox API or by optical character recognition (OCR). This text content is stored in the $file attribute 'text' and can be used for further processing or to search for document content. ## OCR @@ -43,9 +42,9 @@ For example to set the DPI mode call: // define options List options=new ArrayList(); options.add("X-Tika-PDFocrStrategy=OCR_AND_TEXT_EXTRACTION"); - options.add("X-Tika-PDFOcrImageType=RGB"); - options.add("X-Tika-PDFOcrDPI=400"); - + options.add("X-Tika-PDFOcrImageType=RGB"); // support colors + options.add("X-Tika-PDFOcrDPI=72"); // set DPI + options.add("X-Tika-OCRLanguage=eng"); // set English language // start ocr tikaDocumentService.extractText(workitem, "TEXT_AND_OCR", options) @@ -53,7 +52,9 @@ For example to set the DPI mode call: You have various options to configure the Tika server. Find details about how to configure imixs-tika [here](https://github.com/imixs/imixs-docker/tree/master/tika). 
- + - https://cwiki.apache.org/confluence/display/TIKA/TikaServer + - https://cwiki.apache.org/confluence/display/TIKA/TikaOCR + - https://cwiki.apache.org/confluence/display/tika/PDFParser%20(Apache%20PDFBox) ## How to Install diff --git a/imixs-archive-ocr/src/main/java/org/imixs/archive/ocr/OCRService.java b/imixs-archive-ocr/src/main/java/org/imixs/archive/ocr/OCRService.java index 13255e9a..bcf48a5e 100644 --- a/imixs-archive-ocr/src/main/java/org/imixs/archive/ocr/OCRService.java +++ b/imixs-archive-ocr/src/main/java/org/imixs/archive/ocr/OCRService.java @@ -122,7 +122,7 @@ public void extractText(ItemCollection workitem, ItemCollection snapshot, String // validate OCR MODE.... if ("TEXT_ONLY, OCR_ONLY, TEXT_AND_OCR".indexOf(pdfMode) == -1) { throw new PluginException(OCRService.class.getSimpleName(), PLUGIN_ERROR, - "Invalid TIKA_OCR_MODE - exprected one of the following options: TEXT_ONLY | OCR_ONLY | TEXT_AND_OCR"); + "Invalid TIKA_OCR_MODE - expected one of the following options: TEXT_ONLY | OCR_ONLY | TEXT_AND_OCR"); } long l = System.currentTimeMillis();