From 16c63d62e7e150fcb20f4fc87285ccd6b1608ad1 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 11 Apr 2024 17:44:09 -0400
Subject: [PATCH] Update OCR Plugin features and add new localization strings
 (#21)

* Update OCR Plugin features and add new localization strings

* Refactor OCR filter properties callback in ocr-filter.cpp and include necessary headers in text-render-helper.cpp
---
 CMakeLists.txt              |  2 +-
 data/locale/en-US.ini       |  4 +++
 src/consts.h                |  4 +++
 src/filter-data.h           |  1 +
 src/obs-utils.cpp           | 17 ++++++++---
 src/obs-utils.h             |  5 ++-
 src/ocr-filter.cpp          | 34 +++++++++++++++++++++
 src/tesseract-ocr-utils.cpp | 61 ++++++++++++++++++++++++++-----------
 src/tesseract-ocr-utils.h   |  7 ++++-
 src/text-render-helper.cpp  | 38 +++++++++++++++++++++++
 src/text-render-helper.h    | 14 +++++++++
 11 files changed, 163 insertions(+), 24 deletions(-)
 create mode 100644 src/text-render-helper.cpp
 create mode 100644 src/text-render-helper.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0f00b20..2b6f678 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,6 @@ include(cmake/BuildInja.cmake)
 target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE inja)
 
 target_sources(${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c src/obs-utils.cpp src/tesseract-ocr-utils.cpp
-                                             src/ocr-filter.cpp src/ocr-filter-info.c)
+                                             src/ocr-filter.cpp src/ocr-filter-info.c src/text-render-helper.cpp)
 
 set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})
diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
index a52c56b..dbff779 100644
--- a/data/locale/en-US.ini
+++ b/data/locale/en-US.ini
@@ -24,3 +24,7 @@ PreviewBinarization="Preview Binarization"
 RescaleImage="Rescale Image"
 RescaleTargetSize="Rescale Target Size"
 DilationIterations="Dilation Iterations"
+ImageOutputOption="Image Output Option"
+DetectionBoxesMask="Detection Boxes Mask"
+TextRendering="Text Overlay"
+TextWithBackground="Text with Background"
diff --git a/src/consts.h b/src/consts.h
index 304dcaf..d6a7c47 100644
--- a/src/consts.h
+++ b/src/consts.h
@@ -27,4 +27,8 @@ const char *const WHITELIST_CHARS_PORTUGUESE =
 const char *const WHITELIST_CHARS_RUSSIAN =
 	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*()_+-=[]{}|;':\",./<>?`~\\абвгдеёжзийклмнопрстуфхцчшщъыьэюя ";
 
+const int OUTPUT_IMAGE_OPTION_DETECTION_MASK = 0;
+const int OUTPUT_IMAGE_OPTION_TEXT_OVERLAY = 1;
+const int OUTPUT_IMAGE_OPTION_TEXT_BACKGROUND = 2;
+
 #endif /* CONSTS_H */
diff --git a/src/filter-data.h b/src/filter-data.h
index d0e563a..c7a53a3 100644
--- a/src/filter-data.h
+++ b/src/filter-data.h
@@ -53,6 +53,7 @@ struct filter_data {
 	std::string output_format_template;
 	bool update_on_change;
 	int update_on_change_threshold;
+	int output_image_option;
 
 	bool isDisabled;
 
diff --git a/src/obs-utils.cpp b/src/obs-utils.cpp
index 4d01587..5bbe3f3 100644
--- a/src/obs-utils.cpp
+++ b/src/obs-utils.cpp
@@ -159,9 +159,9 @@ void setTextCallback(const std::string &str, struct filter_data *usd)
 	obs_source_release(target);
 };
 
-void setTextDetectionMaskCallback(const cv::Mat &mask, struct filter_data *usd)
+void setTextDetectionMaskCallback(const cv::Mat &mask_rgba, struct filter_data *usd)
 {
-	UNUSED_PARAMETER(mask);
+	UNUSED_PARAMETER(mask_rgba);
 	if (!usd->output_source_mutex) {
 		obs_log(LOG_ERROR, "output_source_mutex is null");
 		return;
@@ -191,7 +191,7 @@ void setTextDetectionMaskCallback(const cv::Mat &mask, struct filter_data *usd)
 	std::string config_folder = obs_module_config_path("");
 	std::string filename = config_folder + "/" + usd->unique_id + ".png";
 	// write the file
-	write_png_file(filename.c_str(), mask.data, mask.cols, mask.rows);
+	write_png_file_rgba(filename.c_str(), mask_rgba.data, mask_rgba.cols, mask_rgba.rows);
 
 	// set the image source settings
 	auto image_settings = obs_source_get_settings(target);
@@ -301,9 +301,18 @@ void check_plugin_config_folder_exists()
 	}
 }
 
-void write_png_file(const char *filename, const unsigned char *image8uc1, int width, int height)
+void write_png_file_8uc1(const char *filename, const unsigned char *image8uc1, int width,
+			 int height)
 {
 	QImage image(image8uc1, width, height, QImage::Format_Grayscale8);
 	QString qfilename(filename);
 	image.save(qfilename);
 }
+
+void write_png_file_rgba(const char *filename, const unsigned char *imageRGBA, int width,
+			 int height)
+{
+	// view the caller's buffer as a width x height RGBA8888 image and save it to `filename`
+	const QImage rgba_image(imageRGBA, width, height, QImage::Format_RGBA8888);
+	rgba_image.save(QString(filename));
+}
diff --git a/src/obs-utils.h b/src/obs-utils.h
index bd01414..b0d129a 100644
--- a/src/obs-utils.h
+++ b/src/obs-utils.h
@@ -25,6 +25,9 @@ void update_image_source_on_settings(struct filter_data *usd, obs_data_t *settin
 
 void check_plugin_config_folder_exists();
 
-void write_png_file(const char *filename, const unsigned char *image8uc3, int width, int height);
+void write_png_file_8uc1(const char *filename, const unsigned char *image8uc1, int width,
+			 int height);
+void write_png_file_rgba(const char *filename, const unsigned char *imageRGBA, int width,
+			 int height);
 
 #endif /* OBS_UTILS_H */
diff --git a/src/ocr-filter.cpp b/src/ocr-filter.cpp
index d68206c..a36bf5b 100644
--- a/src/ocr-filter.cpp
+++ b/src/ocr-filter.cpp
@@ -225,6 +225,8 @@ obs_properties_t *ocr_filter_properties(void *data)
 	// Output formatting
 	obs_properties_add_text(props, "output_formatting", obs_module_text("OutputFormatting"),
 				OBS_TEXT_MULTILINE);
+	// hide the output formatting property by default
+	obs_property_set_visible(obs_properties_get(props, "output_formatting"), false);
 
 	// Add a property for the output text source
 	obs_property_t *text_sources =
@@ -253,6 +255,12 @@ obs_properties_t *ocr_filter_properties(void *data)
 			obs_property_set_visible(obs_properties_get(props_modified,
 								    "output_file_path"),
 						 save_to_file);
+			// show/hide "output_formatting" property based on the selected output source
+			bool show_output_formatting =
+				strcmp(obs_data_get_string(settings, "text_sources"), "none") != 0;
+			obs_property_set_visible(obs_properties_get(props_modified,
+								    "output_formatting"),
+						 show_output_formatting);
 			UNUSED_PARAMETER(property);
 			return true;
 		});
@@ -267,6 +275,30 @@ obs_properties_t *ocr_filter_properties(void *data)
 	obs_property_list_add_string(image_sources, obs_module_text("NoOutput"), "none");
 	// Add the sources
 	obs_enum_sources(add_image_sources_to_list, image_sources);
+	// add change callback for the image sources
+	obs_property_set_modified_callback(
+		obs_properties_get(props, "text_detection_mask_sources"),
+		[](obs_properties_t *props_modified, obs_property_t *, obs_data_t *settings) {
+			// hide/show the image_output_option property based on the selected image source
+			bool show_image_output_option =
+				strcmp(obs_data_get_string(settings, "text_detection_mask_sources"),
+				       "none") != 0;
+			obs_property_set_visible(obs_properties_get(props_modified,
+								    "image_output_option"),
+						 show_image_output_option);
+			return true;
+		});
+
+	// add a choice for the image output format: detection boxes mask, text rendering, or text with background
+	obs_property_t *output_format = obs_properties_add_list(
+		props, "image_output_option", obs_module_text("ImageOutputOption"),
+		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
+	obs_property_list_add_int(output_format, obs_module_text("DetectionBoxesMask"),
+				  OUTPUT_IMAGE_OPTION_DETECTION_MASK);
+	obs_property_list_add_int(output_format, obs_module_text("TextRendering"),
+				  OUTPUT_IMAGE_OPTION_TEXT_OVERLAY);
+	obs_property_list_add_int(output_format, obs_module_text("TextWithBackground"),
+				  OUTPUT_IMAGE_OPTION_TEXT_BACKGROUND);
 
 	// Add a informative text about the plugin
 	obs_properties_add_text(
@@ -302,6 +334,7 @@ void ocr_filter_defaults(obs_data_t *settings)
 	obs_data_set_default_int(settings, "word_length", 5);
 	obs_data_set_default_int(settings, "window_size", 10);
 	obs_data_set_default_string(settings, "output_formatting", "{{output}}");
+	obs_data_set_default_int(settings, "image_output_option", 0);
 }
 
 void ocr_filter_update(void *data, obs_data_t *settings)
@@ -340,6 +373,7 @@ void ocr_filter_update(void *data, obs_data_t *settings)
 	tf->update_on_change = obs_data_get_bool(settings, "update_on_change");
 	tf->update_on_change_threshold =
 		(int)obs_data_get_int(settings, "update_on_change_threshold");
+	tf->output_image_option = (int)obs_data_get_int(settings, "image_output_option");
 
 	// Initialize the Tesseract OCR model
 	initialize_tesseract_ocr(tf, hard_tesseract_init_required);
diff --git a/src/tesseract-ocr-utils.cpp b/src/tesseract-ocr-utils.cpp
index 8615fd2..11be005 100644
--- a/src/tesseract-ocr-utils.cpp
+++ b/src/tesseract-ocr-utils.cpp
@@ -1,6 +1,8 @@
 #include "tesseract-ocr-utils.h"
 #include "plugin-support.h"
 #include "obs-utils.h"
+#include "consts.h"
+#include "text-render-helper.h"
 
 #include <obs-module.h>
 
@@ -179,19 +181,18 @@ std::string run_tesseract_ocr(filter_data *tf, const cv::Mat &image)
 	return recognitionResult;
 }
 
-std::vector<std::vector<cv::Point>> extract_text_detection_boxes(filter_data *tf,
-								 cv::Size imageSize)
+std::vector<OCRBox> extract_text_detection_boxes(filter_data *tf, cv::Size imageSize)
 {
 	// extract the text detection boxes
 	tesseract::ResultIterator *ri = tf->tesseract_model->GetIterator();
 	if (ri == nullptr) {
-		return std::vector<std::vector<cv::Point>>();
+		return std::vector<OCRBox>();
 	}
 	tesseract::PageIteratorLevel level = tesseract::RIL_WORD;
 	if (tf->pageSegmentationMode == tesseract::PSM_SINGLE_CHAR) {
 		level = tesseract::RIL_SYMBOL;
 	}
-	std::vector<std::vector<cv::Point>> boxes;
+	std::vector<OCRBox> boxes;
 	do {
 		if (ri->Empty(level)) {
 			continue;
@@ -204,13 +205,13 @@ std::vector<std::vector<cv::Point>> extract_text_detection_boxes(filter_data *tf
 				continue;
 			}
 		}
-		std::vector<cv::Point> box(4);
+		OCRBox box;
 		int left, top, right, bottom;
 		ri->BoundingBox(level, &left, &top, &right, &bottom);
-		box[0] = cv::Point(left, top);
-		box[1] = cv::Point(right, top);
-		box[2] = cv::Point(right, bottom);
-		box[3] = cv::Point(left, bottom);
+		box.box = cv::Rect(left, top, right - left, bottom - top);
+		// get the text of the box
+		const char *text = ri->GetUTF8Text(level);
+		box.text = text;
 		// get area of box
 		const int area = (right - left) * (bottom - top);
 		// if the area is too small or too big, relative to the image size - skip the box
@@ -415,19 +416,45 @@ void tesseract_thread(void *data)
 				std::string ocr_result = run_tesseract_ocr(tf, imageForOCR);
 
 				if (is_valid_output_source_name(tf->output_image_source_name)) {
+					cv::Mat text_detection_output(imageBGRA.rows,
+								      imageBGRA.cols, CV_8UC4,
+								      cv::Scalar(0, 0, 0, 0));
+
 					// Extract the text detection boxes
-					std::vector<std::vector<cv::Point>> boxes =
+					std::vector<OCRBox> boxes =
 						extract_text_detection_boxes(tf, imageBGRA.size());
 
-					// Create a text detection binary mask
-					cv::Mat text_detection_mask(imageBGRA.rows, imageBGRA.cols,
-								    CV_8UC1, cv::Scalar(0));
-					for (const std::vector<cv::Point> &box : boxes) {
-						cv::fillConvexPoly(text_detection_mask, box,
-								   cv::Scalar(255));
+					if (tf->output_image_option ==
+					    OUTPUT_IMAGE_OPTION_TEXT_OVERLAY) {
+						// Create a text overlay image
+						QImage text_overlay_image =
+							render_boxes_with_qtextdocument(
+								boxes, imageBGRA.cols,
+								imageBGRA.rows);
+						cv::Mat text_overlay_image_mat(
+							text_overlay_image.height(),
+							text_overlay_image.width(), CV_8UC4,
+							text_overlay_image.bits(),
+							text_overlay_image.bytesPerLine());
+						text_overlay_image_mat.copyTo(
+							text_detection_output);
+						// } else if (tf->output_image_option ==
+						// 	   OUTPUT_IMAGE_OPTION_TEXT_BACKGROUND) {
+						// 	// Draw the text detection boxes on the image with a background
+
+					} else {
+						text_detection_output.setTo(
+							cv::Scalar(0, 0, 0, 255));
+
+						// Create a text detection binary mask
+						for (const auto &box : boxes) {
+							cv::rectangle(
+								text_detection_output, box.box,
+								cv::Scalar(255, 255, 255, 255), -1);
+						}
 					}
 
-					setTextDetectionMaskCallback(text_detection_mask, tf);
+					setTextDetectionMaskCallback(text_detection_output, tf);
 				}
 
 				if (!ocr_result.empty() &&
diff --git a/src/tesseract-ocr-utils.h b/src/tesseract-ocr-utils.h
index 3058bf3..7372a29 100644
--- a/src/tesseract-ocr-utils.h
+++ b/src/tesseract-ocr-utils.h
@@ -6,10 +6,15 @@
 #include <deque>
 #include <string>
 
+struct OCRBox { // one text detection produced by extract_text_detection_boxes()
+	std::string text; // UTF-8 text of the detection, from the iterator's GetUTF8Text()
+	cv::Rect box; // bounding box (x, y, width, height), from the iterator's BoundingBox()
+};
+
 void cleanup_config_files(const std::string &unique_id);
 void initialize_tesseract_ocr(filter_data *tf, bool hard_tesseract_init_required = false);
 std::string run_tesseract_ocr(filter_data *tf, const cv::Mat &imageBGRA);
-std::vector<std::vector<cv::Point>> extract_text_detection_boxes(filter_data *tf);
+std::vector<OCRBox> extract_text_detection_boxes(filter_data *tf, cv::Size imageSize);
 std::string strip(const std::string &str);
 void stop_and_join_tesseract_thread(struct filter_data *tf);
 void tesseract_thread(void *data);
diff --git a/src/text-render-helper.cpp b/src/text-render-helper.cpp
new file mode 100644
index 0000000..ffda1bb
--- /dev/null
+++ b/src/text-render-helper.cpp
@@ -0,0 +1,38 @@
+#include "text-render-helper.h"
+
+#include <QPainter>
+#include <QPixmap>
+
+/**
+  * Draw the recognized text of each OCR box, in blue, on a transparent RGBA image
+  * @param boxes OCR boxes (bounding rectangle and recognized text) to draw
+  * @param width Output image width
+  * @param height Output image height
+  * @return A width x height QImage in Format_RGBA8888 containing the drawn text
+  */
+QImage render_boxes_with_qtextdocument(const std::vector<OCRBox> &boxes, uint32_t width,
+				       uint32_t height)
+{
+	// QImage rather than QPixmap: QPixmap is restricted to the GUI thread, and the
+	// caller treats the returned pixels as 4-channel data saved as Format_RGBA8888
+	QImage image((int)width, (int)height, QImage::Format_RGBA8888);
+	image.fill(Qt::transparent);
+	QPainter painter(&image);
+	painter.setCompositionMode(QPainter::CompositionMode_Source);
+	painter.setPen(Qt::blue);
+
+	for (const OCRBox &box : boxes) {
+		if (box.box.height <= 0) {
+			continue; // QFont::setPixelSize() needs a size greater than 0
+		}
+		// set the character size according to the box height
+		QFont font = painter.font();
+		font.setPixelSize(box.box.height);
+		painter.setFont(font);
+		painter.drawText(box.box.x, box.box.y + box.box.height,
+				 QString::fromStdString(box.text));
+	}
+
+	painter.end();
+	return image;
+}
diff --git a/src/text-render-helper.h b/src/text-render-helper.h
new file mode 100644
index 0000000..e299d1a
--- /dev/null
+++ b/src/text-render-helper.h
@@ -0,0 +1,14 @@
+#ifndef TEXT_RENDER_HELPER_H
+#define TEXT_RENDER_HELPER_H
+
+#include "tesseract-ocr-utils.h"
+
+#include <string>
+#include <vector>
+
+#include <QImage>
+
+QImage render_boxes_with_qtextdocument(const std::vector<OCRBox> &boxes, uint32_t width,
+				       uint32_t height);
+
+#endif // TEXT_RENDER_HELPER_H