From f775a859a3329679970fc3c4ec1bbf466e560abc Mon Sep 17 00:00:00 2001 From: Andrew Trefethen Date: Fri, 15 Apr 2022 19:46:04 -0500 Subject: [PATCH 1/3] Make stream parser ignore words that contain 'stream' Some embedded fonts include license information, and one example font included the word 'bitstream' in the text, which confused the stream parser. This commit updates the parser to require 'stream' to be preceded by one of several special characters. --- src/core/parser/PDFObjectParser.ts | 15 ++++++++++++++- src/core/syntax/Keywords.ts | 9 ++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/core/parser/PDFObjectParser.ts b/src/core/parser/PDFObjectParser.ts index c51b59fd4..6f84927cb 100644 --- a/src/core/parser/PDFObjectParser.ts +++ b/src/core/parser/PDFObjectParser.ts @@ -243,14 +243,25 @@ class PDFObjectParser extends BaseParser { protected findEndOfStreamFallback(startPos: Position) { // Move to end of stream, while handling nested streams + let acceptUnprefixedStream = true; let nestingLvl = 1; let end = this.bytes.offset(); while (!this.bytes.done()) { end = this.bytes.offset(); - if (this.matchKeyword(Keywords.stream)) { + if ( + this.matchKeyword(Keywords.embeddedStream1) || + this.matchKeyword(Keywords.embeddedStream2) || + this.matchKeyword(Keywords.embeddedStream3) || + this.matchKeyword(Keywords.embeddedStream4) || + this.matchKeyword(Keywords.embeddedStream5) || + this.matchKeyword(Keywords.embeddedStream6) || + this.matchKeyword(Keywords.embeddedStream7) || + (acceptUnprefixedStream && this.matchKeyword(Keywords.stream)) + ) { nestingLvl += 1; + acceptUnprefixedStream = true; } else if ( this.matchKeyword(Keywords.EOF1endstream) || this.matchKeyword(Keywords.EOF2endstream) || @@ -258,8 +269,10 @@ class PDFObjectParser extends BaseParser { this.matchKeyword(Keywords.endstream) ) { nestingLvl -= 1; + acceptUnprefixedStream = true; } else { this.bytes.next(); + acceptUnprefixedStream = false; } if (nestingLvl === 0) 
break; diff --git a/src/core/syntax/Keywords.ts b/src/core/syntax/Keywords.ts index 12e2f957c..2f13c431a 100644 --- a/src/core/syntax/Keywords.ts +++ b/src/core/syntax/Keywords.ts @@ -1,6 +1,6 @@ import CharCodes from 'src/core/syntax/CharCodes'; -const { Space, CarriageReturn, Newline } = CharCodes; +const { Space, CarriageReturn, Newline, Tab, LessThan, GreaterThan, BackSlash } = CharCodes; const stream = [ CharCodes.s, @@ -76,6 +76,13 @@ export const Keywords = { streamEOF2: [...stream, CarriageReturn, Newline], streamEOF3: [...stream, CarriageReturn], streamEOF4: [...stream, Newline], + embeddedStream1: [Tab, ...stream], + embeddedStream2: [Space, ...stream], + embeddedStream3: [CarriageReturn, ...stream], + embeddedStream4: [Newline, ...stream], + embeddedStream5: [LessThan, ...stream], + embeddedStream6: [GreaterThan, ...stream], + embeddedStream7: [BackSlash, ...stream], endstream, EOF1endstream: [CarriageReturn, Newline, ...endstream], EOF2endstream: [CarriageReturn, ...endstream], From db3e87558b01d6bee477818c157daa6b97ca3b88 Mon Sep 17 00:00:00 2001 From: Andrew Trefethen Date: Fri, 15 Apr 2022 19:46:30 -0500 Subject: [PATCH 2/3] Test that the stream parser ignores words containing 'stream' --- tests/core/parser/PDFObjectParser.spec.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/core/parser/PDFObjectParser.spec.ts b/tests/core/parser/PDFObjectParser.spec.ts index 576dfe14a..5c171ab09 100644 --- a/tests/core/parser/PDFObjectParser.spec.ts +++ b/tests/core/parser/PDFObjectParser.spec.ts @@ -576,6 +576,10 @@ describe(`PDFObjectParser`, () => { '<<>>\n\rstream\n\rthingz\n\rendstream', '<<\n/Length 8\n>>\nstream\n\rthingz\n\nendstream', ], + [ + '<<>>\n\rstream\n\rthingz bitstream\n\rendstream', + '<<\n/Length 18\n>>\nstream\n\rthingz bitstream\n\nendstream', + ], ].forEach(([input, output]) => { it(`can parse ${JSON.stringify(input)}`, () => { const object = parse(typedArrayFor(input)); From 0e7f44babd9ccf5ca58e57de16bcf68c7ffed291 Mon Sep 17 
00:00:00 2001 From: Andrew Trefethen Date: Fri, 15 Apr 2022 20:05:58 -0500 Subject: [PATCH 3/3] Conform to linter rules --- src/core/syntax/Keywords.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/core/syntax/Keywords.ts b/src/core/syntax/Keywords.ts index 2f13c431a..5989af850 100644 --- a/src/core/syntax/Keywords.ts +++ b/src/core/syntax/Keywords.ts @@ -1,6 +1,14 @@ import CharCodes from 'src/core/syntax/CharCodes'; -const { Space, CarriageReturn, Newline, Tab, LessThan, GreaterThan, BackSlash } = CharCodes; +const { + Space, + CarriageReturn, + Newline, + Tab, + LessThan, + GreaterThan, + BackSlash, +} = CharCodes; const stream = [ CharCodes.s,