From f1b67e3c115601cc73bc0cfb6ba4467a35bcb025 Mon Sep 17 00:00:00 2001 From: David Warring Date: Thu, 19 Oct 2023 14:12:34 +1300 Subject: [PATCH] filter control characters from content-level actual text Seeing backspaces, which are maybe meant to represent underscores. Upsets XML, which just doesn't allow them. --- .github/workflows/test.yml | 2 +- README.md | 17 +++-------------- docs/PDF/Tags/Mark.md | 2 +- docs/PDF/Tags/Node.md | 2 +- docs/PDF/Tags/XML-Writer.md | 4 ++-- docs/index.md | 17 +++-------------- lib/PDF/Tags/Mark.rakumod | 14 ++++++++++++-- lib/PDF/Tags/Node.rakumod | 2 +- lib/PDF/Tags/Node/Parent.rakumod | 2 +- lib/PDF/Tags/XML-Writer.rakumod | 15 +++++++-------- 10 files changed, 32 insertions(+), 45 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 813b3b7..a19c0c7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: #- windows-latest raku-version: - 'latest' - - '2021.12' + - '2022.07' runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index 7c2f3c1..d260958 100644 --- a/README.md +++ b/README.md @@ -188,12 +188,10 @@ my PDF::XObject::Image $img .= open: "t/images/lightbulb.gif"; my $figure = $doc.Figure: $gfx, $img, :position[50, 70], :Alt("A light-bulb"); ``` -An [PDF::XObject::Form](https://pdf-raku.github.io/PDF-Class-raku/PDF/XObject/Form) may be associated with a marked content -sub-tree. This is achieved by marking the form against a document fragment, then calling `do` to repeatably -render the form, while inserting the fragment, as demonstrated below: +A [PDF::XObject::Form](https://pdf-raku.github.io/PDF-Class-raku/PDF/XObject/Form) may be associated with a document fragment. 
The +form can then be rendered, and the fragment inserted into the document, by repeatedly calling `do` on the fragment, as demonstrated below: ```raku - use PDF::Tags; use PDF::Tags::Elem; use PDF::Class; @@ -222,20 +220,11 @@ $page.graphics: -> $gfx { }; } - # multiple rendering of the form, and insertion of its structure tree + # multiple rendering of the form, and insertion into the structure tree $doc.do($gfx, $form-frag, :position[150, 70]); $doc.do($gfx, $form-frag, :position[150, 20]); } - ``` - -To insert an XObject Form that has marked content: - -1. Create a new fragment element. -2. Create the Form XObject, marking content against the fragment -3. The `do` method can then be used to both render and insert -a copy of the fragment into the structure tree. - ### Links Links are usually contained in a block element, such as a `Paragraph`. If diff --git a/docs/PDF/Tags/Mark.md b/docs/PDF/Tags/Mark.md index 8b363e6..e2b0ce2 100644 --- a/docs/PDF/Tags/Mark.md +++ b/docs/PDF/Tags/Mark.md @@ -92,7 +92,7 @@ The Marked Content ID within the content stream. These are usually numbered in s method value() returns PDF::Content::Tag -The low-level [PDF::Content::Tag](https://pdf-raku.github.io/PDF-Content-raku) object, which contains further details on the tag: +The low-level [PDF::Content::Tag](https://pdf-raku.github.io/PDF-Content-raku/PDF/Content/Tag) object, which contains further details on the tag: * `canvas` - The owner of the content stream; a PDF::Page or PDF::XObject::Form object. diff --git a/docs/PDF/Tags/Node.md b/docs/PDF/Tags/Node.md index d8320de..cee287a 100644 --- a/docs/PDF/Tags/Node.md +++ b/docs/PDF/Tags/Node.md @@ -13,7 +13,7 @@ Methods ### method cos -Returns the underlying [PDF::Class](https://pdf-raku.github.io/PDF-Class-raku) or [PDF::Content](https://pdf-raku.github.io/PDF-Content-raku) object. 
The [PDF::Tags::Node](https://pdf-raku.github.io/PDF-Tags-raku/PDF/Tags/Node) subclass and [PDF::COS](https://pdf-raku.github.io/PDF-raku) type are mapped as follows: +Returns the underlying [PDF::Class](https://pdf-raku.github.io/PDF-Class-raku) or [PDF::Content](https://pdf-raku.github.io/PDF-Content-raku/PDF/Content) object. The [PDF::Tags::Node](https://pdf-raku.github.io/PDF-Tags-raku/PDF/Tags/Node) subclass and [PDF::COS](https://pdf-raku.github.io/PDF-raku) type are mapped as follows: diff --git a/docs/PDF/Tags/XML-Writer.md b/docs/PDF/Tags/XML-Writer.md index 26749c2..7135a57 100644 --- a/docs/PDF/Tags/XML-Writer.md +++ b/docs/PDF/Tags/XML-Writer.md @@ -12,10 +12,10 @@ Synopsis -------- use PDF::Class; - use PDF::Tags; + use PDF::Tags::Reader; use PDF::Tags::XML-Writer; my PDF::Class $pdf .= open: "t/write-tags.pdf"; - my PDF::Tags $tags .= read: :$pdf; + my PDF::Tags::Reader $tags .= read: :$pdf; my PDF::Tags::XML-Writer $xml-writer .= new: :debug, :root-tag; # atomic write say $xml-writer.Str($tags); diff --git a/docs/index.md b/docs/index.md index 7c2f3c1..d260958 100644 --- a/docs/index.md +++ b/docs/index.md @@ -188,12 +188,10 @@ my PDF::XObject::Image $img .= open: "t/images/lightbulb.gif"; my $figure = $doc.Figure: $gfx, $img, :position[50, 70], :Alt("A light-bulb"); ``` -An [PDF::XObject::Form](https://pdf-raku.github.io/PDF-Class-raku/PDF/XObject/Form) may be associated with a marked content -sub-tree. This is achieved by marking the form against a document fragment, then calling `do` to repeatably -render the form, while inserting the fragment, as demonstrated below: +A [PDF::XObject::Form](https://pdf-raku.github.io/PDF-Class-raku/PDF/XObject/Form) may be associated with a document fragment. 
The +form can then be rendered, and the fragment inserted into the document, by repeatedly calling `do` on the fragment, as demonstrated below: ```raku - use PDF::Tags; use PDF::Tags::Elem; use PDF::Class; @@ -222,20 +220,11 @@ $page.graphics: -> $gfx { }; } - # multiple rendering of the form, and insertion of its structure tree + # multiple rendering of the form, and insertion into the structure tree $doc.do($gfx, $form-frag, :position[150, 70]); $doc.do($gfx, $form-frag, :position[150, 20]); } - ``` - -To insert an XObject Form that has marked content: - -1. Create a new fragment element. -2. Create the Form XObject, marking content against the fragment -3. The `do` method can then be used to both render and insert -a copy of the fragment into the structure tree. - ### Links Links are usually contained in a block element, such as a `Paragraph`. If diff --git a/lib/PDF/Tags/Mark.rakumod b/lib/PDF/Tags/Mark.rakumod index aea027b..fdf2953 100644 --- a/lib/PDF/Tags/Mark.rakumod +++ b/lib/PDF/Tags/Mark.rakumod @@ -84,13 +84,23 @@ class PDF::Tags::Mark fail "todo: update marked content attributes"; callsame(); } + + sub sanitize(Str $_) { + # actual text sometimes have backspaces, etc? + .subst( + /<[ \x0..\x8 ]>/, + '', + :g + ); + } + method ActualText { $.attributes unless $!atts-built; - $!actual-text //= PDF::COS::TextString.COERCE: $_ + $!actual-text //= sanitize PDF::COS::TextString.COERCE: $_ with %!attributes; $!actual-text; } - method remove-actual-text { + method remove-actual-text is DEPRECATED { with $.ActualText { $!actual-text = Nil; $!value.attributes:delete; diff --git a/lib/PDF/Tags/Node.rakumod b/lib/PDF/Tags/Node.rakumod index cdd5133..f448197 100644 --- a/lib/PDF/Tags/Node.rakumod +++ b/lib/PDF/Tags/Node.rakumod @@ -76,7 +76,7 @@ class PDF::Tags::Node { Returns the underlying L or L object. 
The L<PDF::Tags::Node> subclass and L<PDF::COS> type are mapped as follows: =begin table -PDF::Tags::Node object | PDF::Class object |Base class | Notes +PDF::Tags::Node object | PDF::Class object |Base class | Notes ================================================= PDF::Tags | PDF::StructTreeRoot | PDF::Tags::Node::Parent | PDF structure tree root PDF::Tags::Elem | PDF::StructElem | PDF::Tags::Node::Parent | Intermediate structure element node diff --git a/lib/PDF/Tags/Node/Parent.rakumod b/lib/PDF/Tags/Node/Parent.rakumod index 7522023..5608040 100644 --- a/lib/PDF/Tags/Node/Parent.rakumod +++ b/lib/PDF/Tags/Node/Parent.rakumod @@ -16,7 +16,7 @@ class PDF::Tags::Node::Parent has UInt $!elems; has $.style is rw; # Computed CSS style - method elems is also { + method elems(::?CLASS:D:) is also { $!elems //= do with $.cos.kids { when Hash { 1 } default { .elems } diff --git a/lib/PDF/Tags/XML-Writer.rakumod b/lib/PDF/Tags/XML-Writer.rakumod index f37e242..7e34c65 100644 --- a/lib/PDF/Tags/XML-Writer.rakumod +++ b/lib/PDF/Tags/XML-Writer.rakumod @@ -49,16 +49,15 @@ method !chunk(Str $s is copy, UInt $depth = 0) { method !line(|c) { $!feed = True; self!chunk(|c); $!feed = True; } method !frag(|c) { $*inline ?? self!chunk(|c) !! self!line(|c) } -sub html-escape(Str $_) { +sub xml-escape(Str $_) { .trans: /\&/ => '&amp;', /\</ => '&lt;', /\>/ => '&gt;', - } multi sub str-escape(@a) { @a.map(&str-escape).join: ' '; } multi sub str-escape(Str $_) { - html-escape($_).trans: /\"/ => '"e;'; + xml-escape($_).trans: /\"/ => '"e;'; } multi sub str-escape(Pair $_) { str-escape(.value) } multi sub str-escape($_) is default { str-escape(.Str) } @@ -233,7 +232,7 @@ multi method stream-xml(PDF::Tags::Elem $node, UInt :$depth is copy = 0) { } } - given html-escape($_) { + given xml-escape($_) { my $frag = do { when $omit-tag.so { $_ } when .so { '<%s%s>%s</%s>'.sprintf($name, $att, $_, $name) } @@ -287,7 +286,7 @@ multi method stream-xml(PDF::Tags::Mark $node, :$depth!) { multi method stream-xml(PDF::Tags::Text $_, :$depth!) 
{ if .Str -> $text { - self!chunk(html-escape($text), $depth); + self!chunk(xml-escape($text), $depth); } } @@ -297,7 +296,7 @@ method !marked-content(PDF::Tags::Mark $node, :$depth!) { when PDF::Tags::Mark { my $text = self!marked-content($_, :$depth); } - when PDF::Tags::Text { html-escape(.Str) } + when PDF::Tags::Text { xml-escape(.Str) } default { die "unhandled tagged content: {.WHAT.raku}"; } } @text.join; @@ -319,10 +318,10 @@ method !marked-content(PDF::Tags::Mark $node, :$depth!) { =head2 Synopsis use PDF::Class; - use PDF::Tags; + use PDF::Tags::Reader; use PDF::Tags::XML-Writer; my PDF::Class $pdf .= open: "t/write-tags.pdf"; - my PDF::Tags $tags .= read: :$pdf; + my PDF::Tags::Reader $tags .= read: :$pdf; my PDF::Tags::XML-Writer $xml-writer .= new: :debug, :root-tag; # atomic write say $xml-writer.Str($tags);