From 6a2bf21d05cd459a3d7de5b0db65ddf583fc0df4 Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Mon, 26 Apr 2021 15:55:01 +0100
Subject: [PATCH] lstmeval: Improve output by ensuring 'Truth:' text is encoded
 the same way as OCR output

This ensures that transformations such as Unicode normalisation are applied
to the truth output as well as the OCR output, so that the two can be
compared properly.

Before this change, a perfect OCR result could show different lines for
Truth and OCR if the OCR output included characters that were normalised.
---
 src/training/unicharset/lstmtester.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/src/training/unicharset/lstmtester.cpp b/src/training/unicharset/lstmtester.cpp
index 80800445b2..0a971dcd57 100644
--- a/src/training/unicharset/lstmtester.cpp
+++ b/src/training/unicharset/lstmtester.cpp
@@ -91,6 +91,12 @@ std::string LSTMTester::RunEvalSync(int iteration, const double *training_errors
   int error_count = 0;
   while (error_count < total_pages_) {
     const ImageData *trainingdata = test_data_.GetPageBySerial(eval_iteration);
+    std::vector<int> truth_labels;
+    if (!trainer.EncodeString(trainingdata->transcription(), &truth_labels)) {
+      eval_iteration++;
+      continue;
+    }
+    std::string truth_text = trainer.DecodeLabels(truth_labels);
     trainer.SetIteration(++eval_iteration);
     NetworkIO fwd_outputs, targets;
     Trainability result = trainer.PrepareForBackward(trainingdata, &fwd_outputs, &targets);
@@ -99,7 +105,7 @@ std::string LSTMTester::RunEvalSync(int iteration, const double *training_errors
       word_error += trainer.NewSingleError(tesseract::ET_WORD_RECERR);
       ++error_count;
       if (verbosity > 1 || (verbosity > 0 && result != PERFECT)) {
-        tprintf("Truth:%s\n", trainingdata->transcription().c_str());
+        tprintf("Truth:%s\n", truth_text.c_str());
         std::vector<int> ocr_labels;
         std::vector<int> xcoords;
         trainer.LabelsFromOutputs(fwd_outputs, &ocr_labels, &xcoords);