diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0a186e32..22bf7293 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 - Fix for soft hyphen not being replaced by visible hyphen if necessary (#457)
 - Optimize output files by ignoring identity transforms
 - Fix for Acroforms - setting an option to false will still apply the flag (#1495)
+- Fix for text extraction in PDFium-based viewers due to invalid ToUnicodeMap (#1498)
 
 ### [v0.14.0] - 2023-11-09
 
diff --git a/lib/font/embedded.js b/lib/font/embedded.js
index c0312a27..0eddfe0e 100644
--- a/lib/font/embedded.js
+++ b/lib/font/embedded.js
@@ -252,6 +252,18 @@ class EmbeddedFont extends PDFFont {
       entries.push(`<${encoded.join(' ')}>`);
     }
 
+    // Emit one bfrange line per 256 codes so that the first and last code of
+    // a line always share the same high byte; a single range spanning several
+    // high bytes breaks text extraction in PDFium-based viewers (#1498).
+    const chunkSize = 256;
+    const chunks = Math.ceil(entries.length / chunkSize);
+    const ranges = [];
+    for (let i = 0; i < chunks; i++) {
+      const start = i * chunkSize;
+      const end = Math.min((i + 1) * chunkSize, entries.length);
+      ranges.push(`<${toHex(start)}> <${toHex(end - 1)}> [${entries.slice(start, end).join(' ')}]`);
+    }
+
     cmap.end(`\
 /CIDInit /ProcSet findresource begin
 12 dict begin
@@ -267,7 +279,7 @@ begincmap
 <0000>
 endcodespacerange
-1 beginbfrange
-<0000> <${toHex(entries.length - 1)}> [${entries.join(' ')}]
+${ranges.length} beginbfrange
+${ranges.join('\n')}
 endbfrange
 endcmap
 CMapName currentdict /CMap defineresource pop
diff --git a/tests/unit/font.spec.js b/tests/unit/font.spec.js
index 67241554..33e19d6a 100644
--- a/tests/unit/font.spec.js
+++ b/tests/unit/font.spec.js
@@ -1,5 +1,6 @@
-import PDFFontFactory from '../../lib/font_factory';
 import PDFDocument from '../../lib/document';
+import PDFFontFactory from '../../lib/font_factory';
+import { logData } from './helpers';
 
 describe('EmbeddedFont', () => {
   test('no fontLayoutCache option', () => {
@@ -52,4 +53,48 @@ describe('EmbeddedFont', () => {
       expect(dictionary.data.BaseFont).toBe('BAJJZZ+Roboto-Regular');
     });
   });
+
+  describe('toUnicodeMap', () => {
+    test('bfrange lines should not cross highcode boundary', () => {
+      const doc = new PDFDocument({ compress: false });
+      const font = PDFFontFactory.open(
+        doc,
+        'tests/fonts/Roboto-Regular.ttf',
+        undefined,
+        'F1099'
+      );
+
+      // 398 different glyphs
+      font.encode('ABCDEFGHIJKLMNOPQRSTUVWXYZ');
+      font.encode('abcdefghijklmnopqrstuvwxyz');
+      font.encode('ÁÀÂÄÅÃÆÇÐÉÈÊËÍÌÎÏÑÓÒÔÖÕØŒÞÚÙÛÜÝŸ');
+      font.encode('áàâäãåæçðéèêëíìîïıñóòôöõøœßþúùûüýÿ');
+      font.encode('ĀĂĄĆČĎĐĒĖĘĚĞĢĪĮİĶŁĹĻĽŃŅŇŌŐŔŖŘŠŚŞȘŢȚŤŪŮŰŲŽŹŻ');
+      font.encode('āăąćčďđēėęěğģīįķłĺļľńņňōőŕŗřšśşșţțťūůűųžźż');
+      font.encode('ΑΒΓ∆ΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΆΈΉΊΌΎΏΪΫ');
+      font.encode('αβγδεζηθικλµνξοπρςστυφχψωάέήίόύώϊϋΐΰ');
+      font.encode('АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ');
+      font.encode('абвгдежзийклмнопрстуфхцчшщъыьэюя');
+      font.encode('ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏҐӁҒҖҚҢҮҰҲҶҺӘӢӨӮ');
+      font.encode('ѐёђѓєѕіїјљњћќѝўџґӂғҗқңүұҳҷһәӣөӯ');
+
+      const docData = logData(doc);
+      font.toUnicodeCmap();
+      const text = docData.map((d) => d.toString('utf8')).join('');
+
+      let glyphs = 0;
+      for (const block of text.matchAll(/beginbfrange\n((?:.|\n)*?)\nendbfrange/g)) {
+        for (const line of block[1].matchAll(/^<([0-9a-f]+)>\s+<([0-9a-f]+)>\s+\[/igm)) {
+          const low = parseInt(line[1], 16);
+          const high = parseInt(line[2], 16);
+          glyphs += high - low + 1;
+          // both ends of a range must agree in every byte above the lowest one
+          expect(high & 0xFFFFFF00).toBe(low & 0xFFFFFF00);
+        }
+      }
+
+      // NOTE(review): the extra 1 is presumably the .notdef glyph at code 0 - confirm
+      expect(glyphs).toBe(398 + 1);
+    });
+  });
 });