diff --git a/src/UglyToad.PdfPig.Fonts/GlyphList.cs b/src/UglyToad.PdfPig.Fonts/GlyphList.cs index aa264b054..b9ce8c53e 100644 --- a/src/UglyToad.PdfPig.Fonts/GlyphList.cs +++ b/src/UglyToad.PdfPig.Fonts/GlyphList.cs @@ -22,20 +22,13 @@ public class GlyphList private readonly Dictionary<string, string> oddNameToUnicodeCache = new Dictionary<string, string>(); - private static readonly Lazy<GlyphList> LazyAdobeGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("glyphlist")); + private static readonly Lazy<GlyphList> LazyAdobeGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("glyphlist", "additional")); /// <summary> - /// The Adobe Glyph List. + /// The Adobe Glyph List, including the additional glyph names that extend it. /// </summary> public static GlyphList AdobeGlyphList => LazyAdobeGlyphList.Value; - private static readonly Lazy<GlyphList> LazyAdditionalGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("additional")); - - /// <summary> - /// An extension to the Adobe Glyph List. - /// </summary> - public static GlyphList AdditionalGlyphList => LazyAdditionalGlyphList.Value; - private static readonly Lazy<GlyphList> LazyZapfDingbatsGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("zapfdingbats")); /// <summary> @@ -103,7 +96,7 @@ public string NameToUnicode(string name) return result; } - string unicode; + string? unicode; // 1. Drop all the characters from the glyph name starting with the first occurrence of a period (U+002E FULL STOP), if any. 
if (name.IndexOf('.') > 0) { diff --git a/src/UglyToad.PdfPig.Fonts/GlyphListFactory.cs b/src/UglyToad.PdfPig.Fonts/GlyphListFactory.cs index 39a6cedfa..667f5de9f 100644 --- a/src/UglyToad.PdfPig.Fonts/GlyphListFactory.cs +++ b/src/UglyToad.PdfPig.Fonts/GlyphListFactory.cs @@ -4,49 +4,56 @@ using System.Collections.Generic; using System.Globalization; using System.IO; + using System.Linq; using Util; - internal class GlyphListFactory + internal static class GlyphListFactory { - public static GlyphList Get(string listName) +#if NET + private const char Semicolon = ';'; +#else + private static readonly char[] Semicolon = [';']; +#endif + + public static GlyphList Get(params string[] listNames) { - using (var resource = - typeof(GlyphListFactory).Assembly.GetManifestResourceStream( - $"UglyToad.PdfPig.Fonts.Resources.GlyphList.{listName}")) + var result = new Dictionary<string, string>(listNames.Any(n => string.Equals("glyphlist", n, StringComparison.OrdinalIgnoreCase)) ? 4300 : 0); + + foreach (var listName in listNames) { - if (resource == null) + using (var resource = + typeof(GlyphListFactory).Assembly.GetManifestResourceStream( + $"UglyToad.PdfPig.Fonts.Resources.GlyphList.{listName}")) { - throw new ArgumentException($"No embedded glyph list resource was found with the name {listName}."); - } + if (resource == null) + { + throw new ArgumentException($"No embedded glyph list resource was found with the name {listName}."); + } - int? 
capacity = null; - // Prevent too much wasted memory capacity for Adobe GlyphList - if (string.Equals("glyphlist", listName, StringComparison.OrdinalIgnoreCase)) - { - capacity = 4300; + ReadInternal(resource, result); } - - return ReadInternal(resource, capacity); } + +#if NET + result.TrimExcess(); +#endif + return new GlyphList(result); } public static GlyphList Read(Stream stream) { - return ReadInternal(stream); + var result = new Dictionary<string, string>(); + ReadInternal(stream, result); + return new GlyphList(result); } - private static readonly char[] Semicolon = [';']; - - private static GlyphList ReadInternal(Stream stream, int? defaultDictionaryCapacity = 0) + private static void ReadInternal(Stream stream, Dictionary<string, string> result) { if (stream == null) { throw new ArgumentNullException(nameof(stream)); } - var result = defaultDictionaryCapacity.HasValue ? new Dictionary<string, string>(defaultDictionaryCapacity.Value) : []; - - using (var reader = new StreamReader(stream)) { while (!reader.EndOfStream) @@ -62,7 +69,7 @@ private static GlyphList ReadInternal(Stream stream, int? defaultDictionaryCapac { continue; } - + var parts = line.Split(Semicolon, StringSplitOptions.RemoveEmptyEntries); if (parts.Length != 2) @@ -86,11 +93,10 @@ private static GlyphList ReadInternal(Stream stream, int? 
defaultDictionaryCapac value += char.ConvertFromUtf32(code); } + System.Diagnostics.Debug.Assert(!result.ContainsKey(key)); result[key] = value; } } - - return new GlyphList(result); } } } diff --git a/src/UglyToad.PdfPig.Tests/Integration/AdditionalGlyphListTests.cs b/src/UglyToad.PdfPig.Tests/Integration/AdditionalGlyphListTests.cs new file mode 100644 index 000000000..002b21679 --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/AdditionalGlyphListTests.cs @@ -0,0 +1,42 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + using System.Linq; + + public class AdditionalGlyphListTests + { + [Fact] + public void Type1FontSimple1() + { + using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("2108.11480"))) + { + var page = document.GetPage(2); + Assert.Contains("\u22c3", page.Letters.Select(l => l.Value)); + } + } + + [Fact] + public void Type1FontSimple2() + { + using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("ICML03-081"))) + { + var page = document.GetPage(2); + Assert.Contains("\u2211", page.Letters.Select(l => l.Value)); + Assert.Contains("\u220f", page.Letters.Select(l => l.Value)); + Assert.Contains("[", page.Letters.Select(l => l.Value)); + Assert.Contains("]", page.Letters.Select(l => l.Value)); + } + } + + [Fact] + public void Type1FontSimple3() + { + using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("Math119FakingData"))) + { + var page = document.GetPage(4); + Assert.Contains("(", page.Letters.Select(l => l.Value)); + Assert.Contains(")", page.Letters.Select(l => l.Value)); + Assert.Contains("\u2211", page.Letters.Select(l => l.Value)); + } + } + } +} diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/MOZILLA-2775-1.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/MOZILLA-2775-1.pdf new file mode 100644 index 000000000..69b23864e Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/MOZILLA-2775-1.pdf differ diff --git 
a/src/UglyToad.PdfPig.Tests/Integration/Documents/MOZILLA-LINK-5251-1.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/MOZILLA-LINK-5251-1.pdf new file mode 100644 index 000000000..144e9a760 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/MOZILLA-LINK-5251-1.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/PDFBOX-492-4.jar-8.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/PDFBOX-492-4.jar-8.pdf new file mode 100644 index 000000000..69ca9cd84 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/PDFBOX-492-4.jar-8.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/TIKA-469-0.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/TIKA-469-0.pdf new file mode 100644 index 000000000..46ae14568 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/TIKA-469-0.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Integration/ZapfDingbatsTests.cs b/src/UglyToad.PdfPig.Tests/Integration/ZapfDingbatsTests.cs new file mode 100644 index 000000000..86a7c9b1c --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/ZapfDingbatsTests.cs @@ -0,0 +1,55 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + using System.Linq; + + public class ZapfDingbatsTests + { + [Fact] + public void Type1Standard14Font1() + { + using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("TIKA-469-0"))) + { + var page = document.GetPage(2); + Assert.Contains("●", page.Letters.Select(l => l.Value)); + } + } + + [Fact] + public void Type1Standard14Font2() + { + // This document does not actually contain circular references + using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("MOZILLA-LINK-5251-1"))) + { + var page = document.GetPage(1); + Assert.Contains("✁", page.Letters.Select(l => l.Value)); + Assert.Contains("✂", page.Letters.Select(l => l.Value)); + Assert.Contains("✄", page.Letters.Select(l => l.Value)); + Assert.Contains("☎", 
page.Letters.Select(l => l.Value)); + Assert.Contains("✆", page.Letters.Select(l => l.Value)); + Assert.Contains("✇", page.Letters.Select(l => l.Value)); + } + } + + [Fact] + public void Type1FontSimple1() + { + // This document does not actually contain circular references + using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("MOZILLA-2775-1"))) + { + var page = document.GetPage(11); + Assert.Contains("●", page.Letters.Select(l => l.Value)); + } + } + + [Fact] + public void Type1FontSimple2() + { + // This document does not actually contain circular references + using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("PDFBOX-492-4.jar-8"))) + { + var page = document.GetPage(1); + Assert.Contains("\u25a0", page.Letters.Select(l => l.Value)); + } + } + } +} diff --git a/src/UglyToad.PdfPig/PdfFonts/Simple/TrueTypeSimpleFont.cs b/src/UglyToad.PdfPig/PdfFonts/Simple/TrueTypeSimpleFont.cs index 13e4162d9..d23beba69 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Simple/TrueTypeSimpleFont.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Simple/TrueTypeSimpleFont.cs @@ -62,6 +62,9 @@ public TrueTypeSimpleFont( Details = descriptor?.ToDetails(Name?.Data) ?? FontDetails.GetDefault(Name?.Data); + + // Assumes a ZapfDingbats font never reaches this class; the behaviour must be changed if that assumption does not hold. + System.Diagnostics.Debug.Assert(!(encoding is ZapfDingbatsEncoding || Details.Name.Contains("ZapfDingbats"))); } public int ReadCharacterCode(IInputBytes bytes, out int codeLength) @@ -102,8 +105,7 @@ public bool TryGetUnicode(int characterCode, [NotNullWhen(true)] out string? val // Look up the character name in the Adobe Glyph List or additional Glyph List. try { - value = GlyphList.AdobeGlyphList.NameToUnicode(encodedCharacterName) - ?? 
GlyphList.AdditionalGlyphList.NameToUnicode(encodedCharacterName); + value = GlyphList.AdobeGlyphList.NameToUnicode(encodedCharacterName); } catch { diff --git a/src/UglyToad.PdfPig/PdfFonts/Simple/TrueTypeStandard14FallbackSimpleFont.cs b/src/UglyToad.PdfPig/PdfFonts/Simple/TrueTypeStandard14FallbackSimpleFont.cs index 0b3ad1475..c4aa78a92 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Simple/TrueTypeStandard14FallbackSimpleFont.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Simple/TrueTypeStandard14FallbackSimpleFont.cs @@ -42,6 +42,9 @@ public TrueTypeStandard14FallbackSimpleFont(NameToken name, AdobeFontMetrics fon fontMetrics.Weight == "Bold", fontMetrics.Weight == "Bold" ? 700 : FontDetails.DefaultWeight, fontMetrics.ItalicAngle != 0); + + // Assumes a ZapfDingbats font never reaches this class; the behaviour must be changed if that assumption does not hold. + System.Diagnostics.Debug.Assert(!(encoding is ZapfDingbatsEncoding || Details.Name.Contains("ZapfDingbats"))); } public int ReadCharacterCode(IInputBytes bytes, out int codeLength) diff --git a/src/UglyToad.PdfPig/PdfFonts/Simple/Type1FontSimple.cs b/src/UglyToad.PdfPig/PdfFonts/Simple/Type1FontSimple.cs index 04f4e6042..ca0076952 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Simple/Type1FontSimple.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Simple/Type1FontSimple.cs @@ -37,6 +37,8 @@ internal sealed class Type1FontSimple : IFont private readonly TransformationMatrix fontMatrix; + private readonly bool isZapfDingbats; + public NameToken Name { get; } public bool IsVertical { get; } = false; @@ -80,6 +82,7 @@ public Type1FontSimple( Name = name; Details = fontDescriptor?.ToDetails(name?.Data) ?? FontDetails.GetDefault(name?.Data); + isZapfDingbats = encoding is ZapfDingbatsEncoding || Details.Name.Contains("ZapfDingbats"); } public int ReadCharacterCode(IInputBytes bytes, out int codeLength) @@ -124,6 +127,14 @@ public bool TryGetUnicode(int characterCode, [NotNullWhen(true)] out string? 
val try { + if (isZapfDingbats) + { + value = GlyphList.ZapfDingbats.NameToUnicode(name); + if (value is not null) + { + return true; + } + } value = GlyphList.AdobeGlyphList.NameToUnicode(name); } catch diff --git a/src/UglyToad.PdfPig/PdfFonts/Simple/Type1Standard14Font.cs b/src/UglyToad.PdfPig/PdfFonts/Simple/Type1Standard14Font.cs index 51316f814..f84426136 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Simple/Type1Standard14Font.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Simple/Type1Standard14Font.cs @@ -18,6 +18,7 @@ internal sealed class Type1Standard14Font : IFont { private readonly AdobeFontMetrics standardFontMetrics; private readonly Encoding encoding; + private readonly bool isZapfDingbats; public NameToken Name { get; } @@ -39,6 +40,7 @@ public Type1Standard14Font(AdobeFontMetrics standardFontMetrics, Encoding? overr standardFontMetrics.Weight == "Bold", standardFontMetrics.Weight == "Bold" ? 700 : FontDetails.DefaultWeight, standardFontMetrics.ItalicAngle != 0); + isZapfDingbats = encoding is ZapfDingbatsEncoding || Details.Name.Contains("ZapfDingbats"); } public int ReadCharacterCode(IInputBytes bytes, out int codeLength) @@ -49,39 +51,35 @@ public int ReadCharacterCode(IInputBytes bytes, out int codeLength) public bool TryGetUnicode(int characterCode, [NotNullWhen(true)] out string? 
value) { + value = null; + var name = encoding.GetName(characterCode); + if (string.Equals(name, GlyphList.NotDefined, StringComparison.OrdinalIgnoreCase)) { - value = null; return false; } - if (encoding is ZapfDingbatsEncoding) + try { - var listed = GlyphList.ZapfDingbats.NameToUnicode(name); + if (isZapfDingbats) + { + value = GlyphList.ZapfDingbats.NameToUnicode(name); - value = listed; + if (value is not null) + { + return true; + } + } - return true; + value = GlyphList.AdobeGlyphList.NameToUnicode(name); } - - if (encoding is StandardEncoding || encoding is SymbolEncoding) + catch { - var listed = GlyphList.AdobeGlyphList.NameToUnicode(name); - - value = listed; - - return true; + return false; } - else - { - Debug.WriteLine($"Warning: Type1Standard14Font with unexpected encoding: '{encoding.EncodingName}' Expected: 'ZapfDingbatsEncoding','SymbolEncoding' or 'StandardEncoding' . Font: '{standardFontMetrics.FontName}'"); - var listed = GlyphList.AdobeGlyphList.NameToUnicode(name); - value = listed; - - return true; - } + return value is not null; } public CharacterBoundingBox GetBoundingBox(int characterCode) diff --git a/src/UglyToad.PdfPig/PdfFonts/Simple/Type3Font.cs b/src/UglyToad.PdfPig/PdfFonts/Simple/Type3Font.cs index f3097b54e..5fc4f7a0c 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Simple/Type3Font.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Simple/Type3Font.cs @@ -42,6 +42,9 @@ public Type3Font(NameToken name, PdfRectangle boundingBox, TransformationMatrix this.widths = widths; this.toUnicodeCMap = new ToUnicodeCMap(toUnicodeCMap); Details = FontDetails.GetDefault(name?.Data); + + // Assumes a ZapfDingbats font never reaches this class; the behaviour must be changed if that assumption does not hold. + System.Diagnostics.Debug.Assert(!(encoding is ZapfDingbatsEncoding || Details.Name.Contains("ZapfDingbats"))); } public int ReadCharacterCode(IInputBytes bytes, out int codeLength)