From 16c63d62e7e150fcb20f4fc87285ccd6b1608ad1 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 11 Apr 2024 17:44:09 -0400 Subject: [PATCH] Update OCR Plugin features and add new localization strings (#21) * Update OCR Plugin features and add new localization strings * Refactor OCR filter properties callback in ocr-filter.cpp and include necessary headers in text-render-helper.cpp --- CMakeLists.txt | 2 +- data/locale/en-US.ini | 4 +++ src/consts.h | 4 +++ src/filter-data.h | 1 + src/obs-utils.cpp | 17 ++++++++--- src/obs-utils.h | 5 ++- src/ocr-filter.cpp | 34 +++++++++++++++++++++ src/tesseract-ocr-utils.cpp | 61 ++++++++++++++++++++++++++----------- src/tesseract-ocr-utils.h | 7 ++++- src/text-render-helper.cpp | 38 +++++++++++++++++++++++ src/text-render-helper.h | 14 +++++++++ 11 files changed, 163 insertions(+), 24 deletions(-) create mode 100644 src/text-render-helper.cpp create mode 100644 src/text-render-helper.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f00b20..2b6f678 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,6 @@ include(cmake/BuildInja.cmake) target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE inja) target_sources(${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c src/obs-utils.cpp src/tesseract-ocr-utils.cpp - src/ocr-filter.cpp src/ocr-filter-info.c) + src/ocr-filter.cpp src/ocr-filter-info.c src/text-render-helper.cpp) set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name}) diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index a52c56b..dbff779 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -24,3 +24,7 @@ PreviewBinarization="Preview Binarization" RescaleImage="Rescale Image" RescaleTargetSize="Rescale Target Size" DilationIterations="Dilation Iterations" +ImageOutputOption="Image Output Option" +DetectionBoxesMask="Detection Boxes Mask" +TextRendering="Text Overlay" +TextWithBackground="Text with Background" diff --git a/src/consts.h b/src/consts.h index 
304dcaf..d6a7c47 100644 --- a/src/consts.h +++ b/src/consts.h @@ -27,4 +27,8 @@ const char *const WHITELIST_CHARS_PORTUGUESE = const char *const WHITELIST_CHARS_RUSSIAN = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*()_+-=[]{}|;':\",./<>?`~\\абвгдеёжзийклмнопрстуфхцчшщъыьэюя "; +const int OUTPUT_IMAGE_OPTION_DETECTION_MASK = 0; +const int OUTPUT_IMAGE_OPTION_TEXT_OVERLAY = 1; +const int OUTPUT_IMAGE_OPTION_TEXT_BACKGROUND = 2; + #endif /* CONSTS_H */ diff --git a/src/filter-data.h b/src/filter-data.h index d0e563a..c7a53a3 100644 --- a/src/filter-data.h +++ b/src/filter-data.h @@ -53,6 +53,7 @@ struct filter_data { std::string output_format_template; bool update_on_change; int update_on_change_threshold; + int output_image_option; bool isDisabled; diff --git a/src/obs-utils.cpp b/src/obs-utils.cpp index 4d01587..5bbe3f3 100644 --- a/src/obs-utils.cpp +++ b/src/obs-utils.cpp @@ -159,9 +159,9 @@ void setTextCallback(const std::string &str, struct filter_data *usd) obs_source_release(target); }; -void setTextDetectionMaskCallback(const cv::Mat &mask, struct filter_data *usd) +void setTextDetectionMaskCallback(const cv::Mat &mask_rgba, struct filter_data *usd) { - UNUSED_PARAMETER(mask); + UNUSED_PARAMETER(mask_rgba); if (!usd->output_source_mutex) { obs_log(LOG_ERROR, "output_source_mutex is null"); return; @@ -191,7 +191,7 @@ void setTextDetectionMaskCallback(const cv::Mat &mask, struct filter_data *usd) std::string config_folder = obs_module_config_path(""); std::string filename = config_folder + "/" + usd->unique_id + ".png"; // write the file - write_png_file(filename.c_str(), mask.data, mask.cols, mask.rows); + write_png_file_rgba(filename.c_str(), mask_rgba.data, mask_rgba.cols, mask_rgba.rows); // set the image source settings auto image_settings = obs_source_get_settings(target); @@ -301,9 +301,18 @@ void check_plugin_config_folder_exists() } } -void write_png_file(const char *filename, const unsigned char *image8uc1, int width, int 
height) +void write_png_file_8uc1(const char *filename, const unsigned char *image8uc1, int width, + int height) { QImage image(image8uc1, width, height, QImage::Format_Grayscale8); QString qfilename(filename); image.save(qfilename); } + +void write_png_file_rgba(const char *filename, const unsigned char *imageRGBA, int width, + int height) +{ + QImage image(imageRGBA, width, height, QImage::Format_RGBA8888); + QString qfilename(filename); + image.save(qfilename); +} diff --git a/src/obs-utils.h b/src/obs-utils.h index bd01414..b0d129a 100644 --- a/src/obs-utils.h +++ b/src/obs-utils.h @@ -25,6 +25,9 @@ void update_image_source_on_settings(struct filter_data *usd, obs_data_t *settin void check_plugin_config_folder_exists(); -void write_png_file(const char *filename, const unsigned char *image8uc3, int width, int height); +void write_png_file_8uc1(const char *filename, const unsigned char *image8uc3, int width, + int height); +void write_png_file_rgba(const char *filename, const unsigned char *imageRGBA, int width, + int height); #endif /* OBS_UTILS_H */ diff --git a/src/ocr-filter.cpp b/src/ocr-filter.cpp index d68206c..a36bf5b 100644 --- a/src/ocr-filter.cpp +++ b/src/ocr-filter.cpp @@ -225,6 +225,8 @@ obs_properties_t *ocr_filter_properties(void *data) // Output formatting obs_properties_add_text(props, "output_formatting", obs_module_text("OutputFormatting"), OBS_TEXT_MULTILINE); + // hide the output formatting property by default + obs_property_set_visible(obs_properties_get(props, "output_formatting"), false); // Add a property for the output text source obs_property_t *text_sources = @@ -253,6 +255,12 @@ obs_properties_t *ocr_filter_properties(void *data) obs_property_set_visible(obs_properties_get(props_modified, "output_file_path"), save_to_file); + // show/hide "output_formatting" property based on the selected output source + bool show_output_formatting = + strcmp(obs_data_get_string(settings, "text_sources"), "none") != 0; + 
obs_property_set_visible(obs_properties_get(props_modified, + "output_formatting"), + show_output_formatting); UNUSED_PARAMETER(property); return true; }); @@ -267,6 +275,30 @@ obs_properties_t *ocr_filter_properties(void *data) obs_property_list_add_string(image_sources, obs_module_text("NoOutput"), "none"); // Add the sources obs_enum_sources(add_image_sources_to_list, image_sources); + // add change callback for the image sources + obs_property_set_modified_callback( + obs_properties_get(props, "text_detection_mask_sources"), + [](obs_properties_t *props_modified, obs_property_t *, obs_data_t *settings) { + // hide/show the image_output_option property based on the selected image source + bool show_image_output_option = + strcmp(obs_data_get_string(settings, "text_detection_mask_sources"), + "none") != 0; + obs_property_set_visible(obs_properties_get(props_modified, + "image_output_option"), + show_image_output_option); + return true; + }); + + // add a choice for the image output format: detection boxes mask, text rendering, or text with background + obs_property_t *output_format = obs_properties_add_list( + props, "image_output_option", obs_module_text("ImageOutputOption"), + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); + obs_property_list_add_int(output_format, obs_module_text("DetectionBoxesMask"), + OUTPUT_IMAGE_OPTION_DETECTION_MASK); + obs_property_list_add_int(output_format, obs_module_text("TextRendering"), + OUTPUT_IMAGE_OPTION_TEXT_OVERLAY); + obs_property_list_add_int(output_format, obs_module_text("TextWithBackground"), + OUTPUT_IMAGE_OPTION_TEXT_BACKGROUND); // Add a informative text about the plugin obs_properties_add_text( @@ -302,6 +334,7 @@ void ocr_filter_defaults(obs_data_t *settings) obs_data_set_default_int(settings, "word_length", 5); obs_data_set_default_int(settings, "window_size", 10); obs_data_set_default_string(settings, "output_formatting", "{{output}}"); + obs_data_set_default_int(settings, "image_output_option", 0); } void 
ocr_filter_update(void *data, obs_data_t *settings) @@ -340,6 +373,7 @@ void ocr_filter_update(void *data, obs_data_t *settings) tf->update_on_change = obs_data_get_bool(settings, "update_on_change"); tf->update_on_change_threshold = (int)obs_data_get_int(settings, "update_on_change_threshold"); + tf->output_image_option = (int)obs_data_get_int(settings, "image_output_option"); // Initialize the Tesseract OCR model initialize_tesseract_ocr(tf, hard_tesseract_init_required); diff --git a/src/tesseract-ocr-utils.cpp b/src/tesseract-ocr-utils.cpp index 8615fd2..11be005 100644 --- a/src/tesseract-ocr-utils.cpp +++ b/src/tesseract-ocr-utils.cpp @@ -1,6 +1,8 @@ #include "tesseract-ocr-utils.h" #include "plugin-support.h" #include "obs-utils.h" +#include "consts.h" +#include "text-render-helper.h" #include @@ -179,19 +181,18 @@ std::string run_tesseract_ocr(filter_data *tf, const cv::Mat &image) return recognitionResult; } -std::vector> extract_text_detection_boxes(filter_data *tf, - cv::Size imageSize) +std::vector extract_text_detection_boxes(filter_data *tf, cv::Size imageSize) { // extract the text detection boxes tesseract::ResultIterator *ri = tf->tesseract_model->GetIterator(); if (ri == nullptr) { - return std::vector>(); + return std::vector(); } tesseract::PageIteratorLevel level = tesseract::RIL_WORD; if (tf->pageSegmentationMode == tesseract::PSM_SINGLE_CHAR) { level = tesseract::RIL_SYMBOL; } - std::vector> boxes; + std::vector boxes; do { if (ri->Empty(level)) { continue; @@ -204,13 +205,13 @@ std::vector> extract_text_detection_boxes(filter_data *tf continue; } } - std::vector box(4); + OCRBox box; int left, top, right, bottom; ri->BoundingBox(level, &left, &top, &right, &bottom); - box[0] = cv::Point(left, top); - box[1] = cv::Point(right, top); - box[2] = cv::Point(right, bottom); - box[3] = cv::Point(left, bottom); + box.box = cv::Rect(left, top, right - left, bottom - top); + // get the text of the box + const char *text = ri->GetUTF8Text(level); + 
box.text = text; // get area of box const int area = (right - left) * (bottom - top); // if the area is too small or too big, relative to the image size - skip the box @@ -415,19 +416,45 @@ void tesseract_thread(void *data) std::string ocr_result = run_tesseract_ocr(tf, imageForOCR); if (is_valid_output_source_name(tf->output_image_source_name)) { + cv::Mat text_detection_output(imageBGRA.rows, + imageBGRA.cols, CV_8UC4, + cv::Scalar(0, 0, 0, 0)); + // Extract the text detection boxes - std::vector> boxes = + std::vector boxes = extract_text_detection_boxes(tf, imageBGRA.size()); - // Create a text detection binary mask - cv::Mat text_detection_mask(imageBGRA.rows, imageBGRA.cols, - CV_8UC1, cv::Scalar(0)); - for (const std::vector &box : boxes) { - cv::fillConvexPoly(text_detection_mask, box, - cv::Scalar(255)); + if (tf->output_image_option == + OUTPUT_IMAGE_OPTION_TEXT_OVERLAY) { + // Create a text overlay image + QImage text_overlay_image = + render_boxes_with_qtextdocument( + boxes, imageBGRA.cols, + imageBGRA.rows); + cv::Mat text_overlay_image_mat( + text_overlay_image.height(), + text_overlay_image.width(), CV_8UC4, + text_overlay_image.bits(), + text_overlay_image.bytesPerLine()); + text_overlay_image_mat.copyTo( + text_detection_output); + // } else if (tf->output_image_option == + // OUTPUT_IMAGE_OPTION_TEXT_BACKGROUND) { + // // Draw the text detection boxes on the image with a background + + } else { + text_detection_output.setTo( + cv::Scalar(0, 0, 0, 255)); + + // Create a text detection binary mask + for (const auto &box : boxes) { + cv::rectangle( + text_detection_output, box.box, + cv::Scalar(255, 255, 255, 255), -1); + } } - setTextDetectionMaskCallback(text_detection_mask, tf); + setTextDetectionMaskCallback(text_detection_output, tf); } if (!ocr_result.empty() && diff --git a/src/tesseract-ocr-utils.h b/src/tesseract-ocr-utils.h index 3058bf3..7372a29 100644 --- a/src/tesseract-ocr-utils.h +++ b/src/tesseract-ocr-utils.h @@ -6,10 +6,15 @@ 
#include #include +struct OCRBox { + std::string text; + cv::Rect box; +}; + void cleanup_config_files(const std::string &unique_id); void initialize_tesseract_ocr(filter_data *tf, bool hard_tesseract_init_required = false); std::string run_tesseract_ocr(filter_data *tf, const cv::Mat &imageBGRA); -std::vector> extract_text_detection_boxes(filter_data *tf); +std::vector extract_text_detection_boxes(filter_data *tf); std::string strip(const std::string &str); void stop_and_join_tesseract_thread(struct filter_data *tf); void tesseract_thread(void *data); diff --git a/src/text-render-helper.cpp b/src/text-render-helper.cpp new file mode 100644 index 0000000..ffda1bb --- /dev/null +++ b/src/text-render-helper.cpp @@ -0,0 +1,38 @@ +#include "text-render-helper.h" + +#include +#include + +/** + * Render the text of each OCR box onto a transparent image using QPainter + * @param boxes OCR boxes (recognized text and bounding rectangle) to draw + * @param width Output width + * @param height Output height + * @return Image holding the rendered text, drawn in blue with the font pixel + * size set to the height of each box + */ +QImage render_boxes_with_qtextdocument(const std::vector &boxes, uint32_t width, + uint32_t height) +{ + QPixmap pixmap(width, height); + pixmap.fill(Qt::transparent); + QPainter painter; + painter.begin(&pixmap); + painter.setCompositionMode(QPainter::CompositionMode_Source); + + // draw individual boxes on the pixmap + for (const OCRBox &box : boxes) { + painter.setPen(Qt::blue); + // set the character size according to the box height + QFont font = painter.font(); + font.setPixelSize(box.box.height); + painter.setFont(font); + painter.drawText(box.box.x, box.box.y + box.box.height, + QString::fromStdString(box.text)); + } + + painter.setCompositionMode(QPainter::CompositionMode_DestinationIn); + painter.end(); + + return pixmap.toImage(); +} diff --git a/src/text-render-helper.h b/src/text-render-helper.h new file mode 100644 index 0000000..e299d1a --- /dev/null +++ b/src/text-render-helper.h @@ -0,0 +1,14 @@ +#ifndef
TEXT_RENDER_HELPER_H +#define TEXT_RENDER_HELPER_H + +#include "tesseract-ocr-utils.h" + +#include +#include + +#include + +QImage render_boxes_with_qtextdocument(const std::vector &boxes, uint32_t width, + uint32_t height); + +#endif // TEXT_RENDER_HELPER_H