From 1f1f72d1e89821c630dcfc35e1a0a7f653cc877b Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Wed, 10 Jan 2024 12:00:20 +1100 Subject: [PATCH] StreamParser: add fragment parse methods #2096 --- .../org/jsoup/parser/HtmlTreeBuilder.java | 27 +++-- .../java/org/jsoup/parser/StreamParser.java | 43 ++++++- .../java/org/jsoup/parser/TreeBuilder.java | 12 +- .../java/org/jsoup/parser/XmlTreeBuilder.java | 10 +- .../org/jsoup/parser/StreamParserTest.java | 106 +++++++++++++++++- 5 files changed, 171 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 13ce4ab8ea..1f49a0bbb2 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -55,7 +55,7 @@ public class HtmlTreeBuilder extends TreeBuilder { private boolean baseUriSetFromDoc; private @Nullable Element headElement; // the current head element private @Nullable FormElement formElement; // the current form element - private @Nullable Element contextElement; // fragment parse context -- could be null even if fragment parsing + private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing private ArrayList formattingElements; // active (open) formatting elements private ArrayList tmplInsertMode; // stack of Template Insertion modes private List pendingTableCharacters; // chars in table to be shifted out @@ -94,20 +94,19 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { fragmentParsing = false; } - @Override List doParseFragment(@Nullable Element context) { + @Override void initialiseParseFragment(@Nullable Element context) { // context may be null state = HtmlTreeBuilderState.Initial; - contextElement = context; fragmentParsing = true; - Element root = null; if (context != null) { + final String contextName = context.normalName(); + contextElement = new Element(tagFor(contextName, settings), baseUri); if (context.ownerDocument() != null) // quirks setup: doc.quirksMode(context.ownerDocument().quirksMode()); // initialise the tokeniser state: - String contextTag = context.normalName(); - switch (contextTag) { + switch (contextName) { case "title": case "textarea": tokeniser.transition(TokeniserState.Rcdata); @@ -132,9 +131,8 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { default: tokeniser.transition(TokeniserState.Data); } - root = new Element(tagFor(contextTag, settings), baseUri); - doc.appendChild(root); - push(root); + doc.appendChild(contextElement); + push(contextElement); resetInsertionMode(); // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated @@ -148,15 +146,16 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { formSearch = formSearch.parent(); } } + } - runParser(); - if (context != null) { + @Override List completeParseFragment() { + if (contextElement != null) { // depending on context and the input html, content may have been added outside of the root el // e.g. context=p, input=div, the div will have been pushed out. - List nodes = root.siblingNodes(); + List nodes = contextElement.siblingNodes(); if (!nodes.isEmpty()) - root.insertChildren(-1, nodes); - return root.childNodes(); + contextElement.insertChildren(-1, nodes); + return contextElement.childNodes(); } else return doc.childNodes(); diff --git a/src/main/java/org/jsoup/parser/StreamParser.java b/src/main/java/org/jsoup/parser/StreamParser.java index 8d8aae8038..ba0078f8ad 100644 --- a/src/main/java/org/jsoup/parser/StreamParser.java +++ b/src/main/java/org/jsoup/parser/StreamParser.java @@ -17,6 +17,7 @@ import java.io.UncheckedIOException; import java.util.Iterator; import java.util.LinkedList; +import java.util.List; import java.util.NoSuchElementException; import java.util.Queue; import java.util.Spliterator; @@ -67,7 +68,7 @@ public StreamParser(Parser parser) { } /** - Provide the input for a parse. The input is not read until a consuming operation is called. + Provide the input for a Document parse. The input is not read until a consuming operation is called. @param input the input to be read. @param baseUri the URL of this input, for absolute link resolution @return this parser, for chaining @@ -81,7 +82,7 @@ public StreamParser parse(Reader input, String baseUri) { } /** - Provide the input for a parse. The input is not read until a consuming operation is called. + Provide the input for a Document parse. The input is not read until a consuming operation is called. @param input the input to be read @param baseUri the URL of this input, for absolute link resolution @return this parser @@ -90,6 +91,32 @@ public StreamParser parse(String input, String baseUri) { return parse(new StringReader(input), baseUri); } + /** + Provide the input for a fragment parse. The input is not read until a consuming operation is called. + @param input the input to be read + @param context the optional fragment context element + @param baseUri the URL of this input, for absolute link resolution + @return this parser + @see #completeFragment() + */ + public StreamParser parseFragment(Reader input, @Nullable Element context, String baseUri) { + parse(input, baseUri); + treeBuilder.initialiseParseFragment(context); + return this; + } + + /** + Provide the input for a fragment parse. The input is not read until a consuming operation is called. + @param input the input to be read + @param context the optional fragment context element + @param baseUri the URL of this input, for absolute link resolution + @return this parser + @see #completeFragment() + */ + public StreamParser parseFragment(String input, @Nullable Element context, String baseUri) { + return parseFragment(new StringReader(input), context, baseUri); + } + /** Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that @@ -162,6 +189,18 @@ public Document complete() throws IOException { return doc; } + /** + When initialized as a fragment parse, runs the parser until the input is fully read, and returns the completed + fragment child nodes. + @return the completed child nodes + @throws IOException if an I/O error occurs + @see #parseFragment(Reader, Element, String) + */ + public List completeFragment() throws IOException { + treeBuilder.runParser(); + return treeBuilder.completeParseFragment(); + } + /** Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the input will be parsed until the first match is found, or the input is completely read. diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index 8c755f3d5a..fb5c0708fa 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -78,10 +78,16 @@ Document parse(Reader input, String baseUri, Parser parser) { List parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { initialiseParse(new StringReader(inputFragment), baseUri, parser); - return doParseFragment(context); + initialiseParseFragment(context); + runParser(); + return completeParseFragment(); + } + + void initialiseParseFragment(@Nullable Element context) { + // in Html, sets up context; no-op in XML } - abstract List doParseFragment(@Nullable Element context); + abstract List completeParseFragment(); /** Set the node listener, which will then get callbacks for node insert and removals. */ void nodeListener(NodeVisitor nodeListener) { @@ -102,7 +108,7 @@ void runParser() { boolean stepParser() { // if we have reached the end already, step by popping off the stack, to hit nodeRemoved callbacks: if (currentToken.type == Token.TokenType.EOF) { - if (stack.isEmpty()) return false; + if (stack == null || stack.isEmpty()) return false; // stack will be null if TB was closed, as in case of runParser() + completeFragment() pop(); return true; } diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java index 51325e7e7f..34bdf35f9a 100644 --- a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java @@ -34,7 +34,6 @@ public class XmlTreeBuilder extends TreeBuilder { @Override protected void initialiseParse(Reader input, String baseUri, Parser parser) { super.initialiseParse(input, baseUri, parser); - stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack). Note not push()ed, so not onNodeInserted. doc.outputSettings() .syntax(Document.OutputSettings.Syntax.xml) .escapeMode(Entities.EscapeMode.xhtml) @@ -49,6 +48,10 @@ Document parse(String input, String baseUri) { return parse(new StringReader(input), baseUri, new Parser(this)); } + @Override List completeParseFragment() { + return doc.childNodes(); + } + @Override XmlTreeBuilder newInstance() { return new XmlTreeBuilder(); @@ -164,9 +167,4 @@ protected void popStackToClose(Token.EndTag endTag) { } } private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain - - @Override List doParseFragment(@Nullable Element context) { - runParser(); - return doc.childNodes(); - } } diff --git a/src/test/java/org/jsoup/parser/StreamParserTest.java b/src/test/java/org/jsoup/parser/StreamParserTest.java index b9957fc96b..f81bdb0ffc 100644 --- a/src/test/java/org/jsoup/parser/StreamParserTest.java +++ b/src/test/java/org/jsoup/parser/StreamParserTest.java @@ -4,18 +4,18 @@ import org.jsoup.integration.ParseTest; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; import org.jsoup.select.Elements; -import org.jspecify.annotations.NullMarked; import org.junit.jupiter.api.Test; import java.io.BufferedReader; import java.io.File; -import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.Iterator; +import java.util.List; import java.util.NoSuchElementException; import static org.junit.jupiter.api.Assertions.*; @@ -37,6 +37,18 @@ void canStream() { } } + @Test + void canStreamXml() { + String html = "
D1
D2

P One

P Two

D3

P three

"; + try (StreamParser parser = new StreamParser(Parser.xmlParser()).parse(html, "")) { + StringBuilder seen; + seen = new StringBuilder(); + parser.stream().forEachOrdered(el -> trackSeen(el, seen)); + assertEquals("DIV#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];outmost;", seen.toString()); + // checks expected order, and the + indicates that element had a next sibling at time of emission + } + } + @Test void canIterate() { // same as stream, just a different interface String html = "Test
D1
D2

P One

P Two

D3

P three

"; @@ -327,4 +339,94 @@ private static CharacterReader getReader(StreamParser streamer) { // the reader should be closed as streamer is closed on completion of read assertTrue(isClosed(streamer)); } + + // Fragments + + @Test + void canStreamFragment() { + String html = "OneTwoThree"; + Element context = new Element("table"); + + try (StreamParser parser = new StreamParser(Parser.htmlParser()).parseFragment(html, context, "")) { + StringBuilder seen = new StringBuilder(); + parser.stream().forEachOrdered(el -> trackSeen(el, seen)); + assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", seen.toString()); + // checks expected order, and the + indicates that element had a next sibling at time of emission + // note that we don't get a full doc, just the fragment (and the context at the end of the stack) + + assertTrue(isClosed(parser)); // as read to completion + } + } + + @Test void canIterateFragment() { + // same as stream, just a different interface + String html = "OneTwoThree"; // missing , following infers it + Element context = new Element("table"); + + try(StreamParser parser = new StreamParser(Parser.htmlParser()).parseFragment(html, context, "")) { + StringBuilder seen = new StringBuilder(); + + Iterator it = parser.iterator(); + while (it.hasNext()) { + trackSeen(it.next(), seen); + } + + assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", seen.toString()); + // checks expected order, and the + indicates that element had a next sibling at time of emission + // note that we don't get a full doc, just the fragment (and the context at the end of the stack) + + assertTrue(isClosed(parser)); // as read to completion + } + } + + @Test + void canSelectAndCompleteFragment() throws IOException { + String html = "OneTwoThree"; + Element context = new Element("table"); + + try (StreamParser parser = new StreamParser(Parser.htmlParser()).parseFragment(html, context, "")) { + Element first = parser.expectNext("td"); + assertEquals("One", first.ownText()); + + Element el = parser.expectNext("td"); + assertEquals("Two", el.ownText()); + + el = parser.expectNext("td"); + assertEquals("Three", el.ownText()); + + el = parser.selectNext("td"); + assertNull(el); + + List nodes = parser.completeFragment(); + assertEquals(1, nodes.size()); // should be the inferred tbody + Node tbody = nodes.get(0); + assertEquals("tbody", tbody.nodeName()); + List trs = tbody.childNodes(); + assertEquals(3, trs.size()); // should be the three TRs + assertSame(trs.get(0).childNode(0), first); // tr -> td + + assertSame(parser.document(), first.ownerDocument()); // the shell document for this fragment + } + } + + @Test + void canStreamFragmentXml() throws IOException { + String html = "OneTwoThree"; + Element context = new Element("Other"); + + try (StreamParser parser = new StreamParser(Parser.xmlParser()).parseFragment(html, context, "")) { + StringBuilder seen = new StringBuilder(); + parser.stream().forEachOrdered(el -> trackSeen(el, seen)); + assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;", seen.toString()); + // checks expected order, and the + indicates that element had a next sibling at time of emission + // note that we don't get a full doc, just the fragment + + assertTrue(isClosed(parser)); // as read to completion + + List nodes = parser.completeFragment(); + assertEquals(3, nodes.size()); + assertEquals("tr", nodes.get(0).nodeName()); + } + } + }