From 6a2bf21d05cd459a3d7de5b0db65ddf583fc0df4 Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 26 Apr 2021 15:55:01 +0100 Subject: [PATCH] lstmeval: Improve output by ensuring 'Truth:' text is encoded the same way as OCR output This ensures that transformations like Unicode normalisation are done on the truth output as well as the OCR output, so that you can compare the two properly. Before this change, a perfect OCR result could show different lines for Truth and OCR if the OCR output included characters that were normalised. --- src/training/unicharset/lstmtester.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/training/unicharset/lstmtester.cpp b/src/training/unicharset/lstmtester.cpp index 80800445b2..0a971dcd57 100644 --- a/src/training/unicharset/lstmtester.cpp +++ b/src/training/unicharset/lstmtester.cpp @@ -91,6 +91,12 @@ std::string LSTMTester::RunEvalSync(int iteration, const double *training_errors int error_count = 0; while (error_count < total_pages_) { const ImageData *trainingdata = test_data_.GetPageBySerial(eval_iteration); + std::vector<int> truth_labels; + if (!trainer.EncodeString(trainingdata->transcription(), &truth_labels)) { + eval_iteration++; + continue; + } + std::string truth_text = trainer.DecodeLabels(truth_labels); trainer.SetIteration(++eval_iteration); NetworkIO fwd_outputs, targets; Trainability result = trainer.PrepareForBackward(trainingdata, &fwd_outputs, &targets); @@ -99,7 +105,7 @@ std::string LSTMTester::RunEvalSync(int iteration, const double *training_errors word_error += trainer.NewSingleError(tesseract::ET_WORD_RECERR); ++error_count; if (verbosity > 1 || (verbosity > 0 && result != PERFECT)) { - tprintf("Truth:%s\n", trainingdata->transcription().c_str()); + tprintf("Truth:%s\n", truth_text.c_str()); std::vector<int> ocr_labels; std::vector<int> xcoords; trainer.LabelsFromOutputs(fwd_outputs, &ocr_labels, &xcoords);