Skip to content

Commit

Permalink
Improve OCR text alignment
Browse files Browse the repository at this point in the history
This is nearly a full rewrite of the alignment code. Position is now based on the line baseline (provided by Tesseract) and the font size is smarter (defaulting to Tesseract's provided value with various adjustments).

The goals were:
- Have Ctrl+F highlight the word as accurately as possible.
- Have Ctrl+A/Ctrl+C end up with text that matches the original as closely as possible.
- Have PdfSharp and Pdfium produce consistent output.
On my test cases all goals are fully met.

#236
  • Loading branch information
cyanfish committed Mar 27, 2024
1 parent 1abcdd6 commit 79bba70
Show file tree
Hide file tree
Showing 9 changed files with 254 additions and 133 deletions.
2 changes: 1 addition & 1 deletion NAPS2.App.Tests/Appium/ImportAndSaveTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public void ImportVariousAndSavePdfWithOcr(IAppTestTarget target)
PdfAsserts.AssertContainsTextOnce("Page one.", path);
PdfAsserts.AssertContainsTextOnce("Page two.", path);
PdfAsserts.AssertContainsTextOnce("ADVERTISEMENT.", path);
PdfAsserts.AssertContainsTextOnce("Patch Code separator sheet geometry", path);
PdfAsserts.AssertContainsTextOnce("Sized for printing unscaled", path);
AppTestHelper.AssertNoErrorLog(FolderPath);
}

Expand Down
9 changes: 6 additions & 3 deletions NAPS2.Sdk.Tests/ContextualTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,13 @@ public void SetUpFakeOcr(Dictionary<IMemoryImage, string> ocrTextByImage = null,
var ocrImage = ImageContext.Load(path);
await Task.Delay(delay);

OcrResult CreateOcrResult(string text) => new((0, 0, 100, 100),
ImmutableList.Create(
OcrResult CreateOcrResult(string text)
{
var list = ImmutableList.Create(
new OcrResultElement(text, ocrParams.LanguageCode!, false,
(10, 10, 10, 10))));
(10, 10, 10, 10), 0, 10, ImmutableList<OcrResultElement>.Empty));
return new((0, 0, 100, 100), list, list);
}

if (ocrTextByImage != null)
{
Expand Down
4 changes: 2 additions & 2 deletions NAPS2.Sdk.Tests/Ocr/OcrRequestQueueTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,8 @@ private string CreateTempFile()

private static OcrResult CreateOcrResult()
{
var uniqueElement = new OcrResultElement(Guid.NewGuid().ToString(), "eng", false, (0, 0, 1, 1));
return new OcrResult((0, 0, 1, 1), ImmutableList<OcrResultElement>.Empty.Add(uniqueElement));
var uniqueElement = new OcrResultElement(Guid.NewGuid().ToString(), "eng", false, (0, 0, 1, 1), 0, 10, ImmutableList<OcrResultElement>.Empty);
return new OcrResult((0, 0, 1, 1), ImmutableList.Create(uniqueElement), ImmutableList.Create(uniqueElement));
}

private static OcrParams CreateOcrParams()
Expand Down
22 changes: 11 additions & 11 deletions NAPS2.Sdk.Tests/Ocr/TesseractOcrEngineTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,31 +26,31 @@ public async Task ProcessEnglishImage()
var ocrParams = new OcrParams("eng", OcrMode.Fast, 0);
var result = await _engine.ProcessImage(ScanningContext, _testImagePath, ocrParams, CancellationToken.None);
Assert.NotNull(result);
Assert.NotEmpty(result.Elements);
foreach (var element in result.Elements)
Assert.NotEmpty(result.Words);
foreach (var element in result.Words)
{
Assert.Equal("eng", element.LanguageCode);
Assert.False(element.RightToLeft);
}
Assert.Equal("ADVERTISEMENT.", result.Elements[0].Text);
Assert.InRange(result.Elements[0].Bounds.x, 139, 149);
Assert.InRange(result.Elements[0].Bounds.y, 26, 36);
Assert.InRange(result.Elements[0].Bounds.w, 237, 247);
Assert.InRange(result.Elements[0].Bounds.h, 17, 27);
Assert.Equal("ADVERTISEMENT.", result.Words[0].Text);
Assert.InRange(result.Words[0].Bounds.x, 139, 149);
Assert.InRange(result.Words[0].Bounds.y, 26, 36);
Assert.InRange(result.Words[0].Bounds.w, 237, 247);
Assert.InRange(result.Words[0].Bounds.h, 17, 27);
}

[Fact]
public async Task ProcessHebrewImage()
{
var result = await _engine.ProcessImage(ScanningContext, _testImagePathHebrew, new OcrParams("heb", OcrMode.Fast, 0), CancellationToken.None);
Assert.NotNull(result);
Assert.NotEmpty(result.Elements);
foreach (var element in result.Elements)
Assert.NotEmpty(result.Words);
foreach (var element in result.Words)
{
Assert.Equal("heb", element.LanguageCode);
Assert.True(element.RightToLeft);
}
Assert.Equal("הקדמת", result.Elements[0].Text);
Assert.Equal("הקדמת", result.Words[0].Text);
}

[Fact(Skip = "flaky")]
Expand Down Expand Up @@ -97,6 +97,6 @@ public async Task Mode()
var mode = OcrMode.Best;
var result = await _engine.ProcessImage(ScanningContext, _testImagePath, new OcrParams("eng", mode, 0), CancellationToken.None);
Assert.NotNull(result);
Assert.Equal("ADVERTISEMENT.", result.Elements[0].Text);
Assert.Equal("ADVERTISEMENT.", result.Words[0].Text);
}
}
15 changes: 7 additions & 8 deletions NAPS2.Sdk/Ocr/OcrResult.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@ namespace NAPS2.Ocr;
/// <summary>
/// The result of an OCR request. Contains a set of elements that represent text segments.
/// </summary>
public class OcrResult
public class OcrResult(
(int x, int y, int w, int h) pageBounds,
ImmutableList<OcrResultElement> words,
ImmutableList<OcrResultElement> lines)
{
public OcrResult((int x, int y, int w, int h) pageBounds, ImmutableList<OcrResultElement> elements)
{
PageBounds = pageBounds;
Elements = elements;
}
public (int x, int y, int w, int h) PageBounds { get; } = pageBounds;

public (int x, int y, int w, int h) PageBounds { get; }
public ImmutableList<OcrResultElement> Words { get; } = words;

public ImmutableList<OcrResultElement> Elements { get; }
public ImmutableList<OcrResultElement> Lines { get; } = lines;
}
13 changes: 11 additions & 2 deletions NAPS2.Sdk/Ocr/OcrResultElement.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
namespace NAPS2.Ocr;
using System.Collections.Immutable;

namespace NAPS2.Ocr;

/// <summary>
/// An element in the result of an OCR request that represents a text segment.
/// </summary>
public record OcrResultElement(string Text, string LanguageCode, bool RightToLeft, (int x, int y, int w, int h) Bounds);
// One recognized text segment. As populated by TesseractOcrEngine, an element is either a single
// word or a whole line that aggregates its words.
public record OcrResultElement(
// The recognized text (for a line: its words joined with single spaces).
string Text,
// Value of the nearest enclosing hOCR "lang" attribute; empty string when none is present.
string LanguageCode,
// True when the nearest enclosing hOCR "dir" attribute is "rtl".
bool RightToLeft,
// Bounding box as (left, top, width, height), derived from the hOCR "bbox" corners.
(int x, int y, int w, int h) Bounds,
// Y coordinate of the text baseline, in the same coordinate space as Bounds.
int Baseline,
// Font size taken from the hOCR "x_fsize" value; 0 when the engine did not report one.
int FontSize,
// For a line element, the words it contains; empty for word elements.
ImmutableList<OcrResultElement> Children);
162 changes: 113 additions & 49 deletions NAPS2.Sdk/Ocr/TesseractOcrEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using Microsoft.Extensions.Logging;
using NAPS2.Scan;
using NAPS2.Unmanaged;
using Bounds = (int x, int y, int w, int h);

namespace NAPS2.Ocr;

Expand Down Expand Up @@ -74,10 +75,11 @@ private TesseractOcrEngine(string tesseractPath, string? languageDataBasePath =
{
PreProcessImage(scanningContext, imagePath);
}
var configVals = "-c tessedit_create_hocr=1 -c hocr_font_info=1";
var startInfo = new ProcessStartInfo
{
FileName = _tesseractPath,
Arguments = $"\"{imagePath}\" \"{tempHocrFilePath}\" -l {ocrParams.LanguageCode} hocr",
Arguments = $"\"{imagePath}\" \"{tempHocrFilePath}\" -l {ocrParams.LanguageCode} {configVals}",
UseShellExecute = false,
CreateNoWindow = true,
RedirectStandardOutput = true,
Expand All @@ -92,8 +94,6 @@ private TesseractOcrEngine(string tesseractPath, string? languageDataBasePath =
languageDataPath = Path.Combine(languageDataPath, subfolder);
}
startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = languageDataPath;
var tessdata = new DirectoryInfo(languageDataPath);
EnsureHocrConfigExists(tessdata);
}
var tesseractProcess = Process.Start(startInfo);
if (tesseractProcess == null)
Expand Down Expand Up @@ -150,22 +150,7 @@ private TesseractOcrEngine(string tesseractPath, string? languageDataBasePath =
}
#endif
XDocument hocrDocument = XDocument.Load(tempHocrFilePathWithExt);
var pageBounds = hocrDocument.Descendants()
.Where(x => x.Attributes("class").Any(y => y.Value == "ocr_page"))
.Select(x => GetBounds(x.Attribute("title")))
.First();
var elements = hocrDocument.Descendants()
.Where(x => x.Attributes("class").Any(y => y.Value == "ocrx_word"))
.Where(x => !string.IsNullOrWhiteSpace(x.Value))
.Select(x =>
{
var text = x.Value;
var lang = GetNearestAncestorAttribute(x, "lang") ?? "";
var rtl = GetNearestAncestorAttribute(x, "dir") == "rtl";
var bounds = GetBounds(x.Attribute("title"));
return new OcrResultElement(text, lang, rtl, bounds);
}).ToImmutableList();
return new OcrResult(pageBounds, elements);
return CreateOcrResult(hocrDocument);
}
catch (XmlException e)
{
Expand Down Expand Up @@ -211,57 +196,136 @@ private static void PreProcessImage(ScanningContext scanningContext, string imag
}
}

/// <summary>
/// Converts a parsed hOCR document into an <see cref="OcrResult"/> that holds both the individual
/// words and the lines they belong to.
/// </summary>
/// <param name="hocrDocument">The hOCR output, already loaded as XML.</param>
/// <returns>The page bounds plus flat lists of word elements and line elements.</returns>
private OcrResult CreateOcrResult(XDocument hocrDocument)
{
    // First() throws if the document has no ocr_page element.
    var pageNode = hocrDocument.Descendants().First(node => GetClass(node) == "ocr_page");
    var pageBounds = GetBounds(pageNode);

    var allWords = new List<OcrResultElement>();
    var allLines = new List<OcrResultElement>();
    var lineNodes = hocrDocument.Descendants()
        .Where(node => GetClass(node) is "ocr_line" or "ocr_header" or "ocr_textfloat");
    foreach (var lineNode in lineNodes)
    {
        var lineBounds = GetBounds(lineNode);
        var textAngle = GetTextAngle(lineNode);
        var baselineParams = GetBaselineParams(lineNode);
        bool isRotated = textAngle >= 45 || textAngle <= -45;

        var lineWordsBuilder = ImmutableList.CreateBuilder<OcrResultElement>();
        foreach (var wordNode in lineNode.Descendants())
        {
            // Only non-blank word elements contribute to the result.
            if (GetClass(wordNode) != "ocrx_word" || string.IsNullOrWhiteSpace(wordNode.Value))
            {
                continue;
            }
            var wordBounds = GetBounds(wordNode);
            // TODO: Maybe we can properly handle rotated text?
            // For rotated lines the baseline equation is not used; fall back to the bottom of the word box.
            int wordBaseline = isRotated
                ? wordBounds.y + wordBounds.h
                : CalculateBaseline(baselineParams, lineBounds, wordBounds);
            lineWordsBuilder.Add(new OcrResultElement(
                wordNode.Value,
                GetNearestAncestorAttribute(wordNode, "lang") ?? "",
                GetNearestAncestorAttribute(wordNode, "dir") == "rtl",
                wordBounds,
                wordBaseline,
                GetFontSize(wordNode),
                ImmutableList<OcrResultElement>.Empty));
        }
        if (lineWordsBuilder.Count == 0)
        {
            // A line without any usable word produces no line element either.
            continue;
        }

        var lineWords = lineWordsBuilder.ToImmutable();
        allWords.AddRange(lineWords);
        // The line element inherits language, direction and font size from its first word.
        allLines.Add(lineWords[0] with
        {
            Text = string.Join(" ", lineWords.Select(word => word.Text)),
            Bounds = lineBounds,
            Baseline = CalculateBaseline(baselineParams, lineBounds, lineBounds),
            Children = lineWords
        });
    }
    return new OcrResult(pageBounds, allWords.ToImmutableList(), allLines.ToImmutableList());
}

/// <summary>
/// Looks up an attribute on the element itself or, failing that, on its closest ancestor that has it.
/// </summary>
/// <param name="x">The element to start searching from.</param>
/// <param name="attributeName">The unqualified attribute name to look for.</param>
/// <returns>The attribute value from the nearest element that defines it, or null if none does.</returns>
private static string? GetNearestAncestorAttribute(XElement x, string attributeName)
{
    // AncestorsAndSelf() walks from the element outward, so the first hit is the nearest one.
    foreach (var candidate in x.AncestorsAndSelf())
    {
        var attribute = candidate.Attribute(attributeName);
        if (attribute != null)
        {
            return attribute.Value;
        }
    }
    return null;
}

private void EnsureHocrConfigExists(DirectoryInfo tessdata)
private string? GetClass(XElement? element)
{
try
{
var configDir = new DirectoryInfo(Path.Combine(tessdata.FullName, "configs"));
if (!configDir.Exists)
{
configDir.Create();
}
var hocrConfigFile = new FileInfo(Path.Combine(configDir.FullName, "hocr"));
if (!hocrConfigFile.Exists)
{
using var writer = hocrConfigFile.CreateText();
writer.Write("tessedit_create_hocr 1");
}
}
catch (Exception)
{
// Possibly contention over creating the file. As long as it's created assume everything is okay.
if (!File.Exists(Path.Combine(tessdata.FullName, "configs", "hocr")))
{
throw;
}
}
return element?.Attribute("class")?.Value;
}

private (int x, int y, int w, int h) GetBounds(XAttribute? titleAttr)
private bool ParseData(XElement? element, string dataKey, int dataCount, out string[] parts)
{
var bounds = (0, 0, 0, 0);
parts = Array.Empty<string>();
var titleAttr = element?.Attribute("title");
if (titleAttr != null)
{
foreach (var param in titleAttr.Value.Split(';'))
{
string[] parts = param.Trim().Split(' ');
if (parts.Length == 5 && parts[0] == "bbox")
parts = param.Trim().Split(' ');
if (parts[0] == dataKey && parts.Length == dataCount + 1)
{
int x1 = int.Parse(parts[1]), y1 = int.Parse(parts[2]);
int x2 = int.Parse(parts[3]), y2 = int.Parse(parts[4]);
bounds = (x1, y1, x2 - x1, y2 - y1);
return true;
}
}
}
return false;
}

/// <summary>
/// Reads the hOCR "bbox" entry from the element's title attribute and converts its two corner
/// points into an (x, y, width, height) rectangle.
/// </summary>
/// <param name="element">The hOCR element to read; may be null.</param>
/// <returns>The bounds, or (0, 0, 0, 0) when no bbox entry with exactly four values is present.</returns>
/// <exception cref="System.FormatException">A bbox value is not a valid integer.</exception>
private Bounds GetBounds(XElement? element)
{
    if (!ParseData(element, "bbox", 4, out string[] parts))
    {
        return (0, 0, 0, 0);
    }
    // hOCR is machine-generated, so numbers are parsed with the invariant culture rather than
    // whatever culture the current thread happens to use.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    int x1 = int.Parse(parts[1], invariant), y1 = int.Parse(parts[2], invariant);
    int x2 = int.Parse(parts[3], invariant), y2 = int.Parse(parts[4], invariant);
    return (x1, y1, x2 - x1, y2 - y1);
}

/// <summary>
/// Reads the hOCR "x_fsize" entry (the font size reported by the OCR engine) from the element's
/// title attribute.
/// </summary>
/// <param name="element">The hOCR element to read; may be null.</param>
/// <returns>The font size, or 0 when no x_fsize entry with exactly one value is present.</returns>
/// <exception cref="System.FormatException">The x_fsize value is not a valid integer.</exception>
private int GetFontSize(XElement? element)
{
    if (!ParseData(element, "x_fsize", 1, out string[] parts))
    {
        return 0;
    }
    // hOCR is machine-generated, so the value is parsed with the invariant culture rather than
    // the current thread culture.
    return int.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);
}

/// <summary>
/// Reads the hOCR "baseline" entry from the element's title attribute. The two values are the
/// slope (m) and offset (b) of the line y = m*x + b that CalculateBaseline evaluates.
/// </summary>
/// <param name="element">The hOCR line element to read; may be null.</param>
/// <returns>The (m, b) pair, or (0, 0) when no baseline entry with exactly two values is present.</returns>
/// <exception cref="System.FormatException">A baseline value is not a valid number.</exception>
private (float m, float b) GetBaselineParams(XElement? element)
{
    if (!ParseData(element, "baseline", 2, out string[] parts))
    {
        return (0, 0);
    }
    // The values always use "." as the decimal separator. Parsing with the current culture would
    // misread e.g. "-0.012" on systems where "," is the decimal separator, so force the invariant culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    float m = float.Parse(parts[1], invariant);
    float b = float.Parse(parts[2], invariant);
    return (m, b);
}

/// <summary>
/// Reads the hOCR "textangle" entry from the element's title attribute.
/// </summary>
/// <param name="element">The hOCR line element to read; may be null.</param>
/// <returns>The text angle, or 0 when no textangle entry with exactly one value is present.</returns>
/// <exception cref="System.FormatException">The textangle value is not a valid number.</exception>
private float GetTextAngle(XElement? element)
{
    if (!ParseData(element, "textangle", 1, out string[] parts))
    {
        return 0;
    }
    // The value always uses "." as the decimal separator; parse it with the invariant culture so
    // the result does not depend on the current thread culture.
    return float.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);
}

/// <summary>
/// Computes the absolute y coordinate of the text baseline for an element that sits on a line.
/// </summary>
/// <param name="baselineParams">Slope (m) and offset (b) of the line's baseline equation y = m*x + b.</param>
/// <param name="lineBounds">Bounds of the line; the equation is relative to its left edge and bottom edge.</param>
/// <param name="elementBounds">Bounds of the element (a word, or the line itself) to compute the baseline for.</param>
/// <returns>The baseline y coordinate, rounded to the nearest integer.</returns>
private int CalculateBaseline((float m, float b) baselineParams, Bounds lineBounds, Bounds elementBounds)
{
    var (m, b) = baselineParams;
    // Evaluate the baseline equation at the element's horizontal center, measured from the
    // left side of the line.
    float midpoint = elementBounds.x + elementBounds.w / 2f;
    int relativeBaseline = (int) Math.Round(b + m * (midpoint - lineBounds.x));
    // The result of the equation is relative to the bottom edge of the line's bounding box.
    int lineBottom = lineBounds.y + lineBounds.h;
    return relativeBaseline + lineBottom;
}

// TODO: Consider adding back CanProcess, or otherwise using this code to get the languages from a system engine
// private void CheckIfInstalled()
// {
Expand Down
Loading

0 comments on commit 79bba70

Please sign in to comment.