Skip to content

Commit 1f1f72d

Browse files
committed
StreamParser: add fragment parse methods
jhy#2096
1 parent 2b443df commit 1f1f72d

File tree

5 files changed

+171
-27
lines changed

5 files changed

+171
-27
lines changed

src/main/java/org/jsoup/parser/HtmlTreeBuilder.java

+13-14
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public class HtmlTreeBuilder extends TreeBuilder {
5555
private boolean baseUriSetFromDoc;
5656
private @Nullable Element headElement; // the current head element
5757
private @Nullable FormElement formElement; // the current form element
58-
private @Nullable Element contextElement; // fragment parse context -- could be null even if fragment parsing
58+
private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing
5959
private ArrayList<Element> formattingElements; // active (open) formatting elements
6060
private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes
6161
private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out
@@ -94,20 +94,19 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
9494
fragmentParsing = false;
9595
}
9696

97-
@Override List<Node> doParseFragment(@Nullable Element context) {
97+
@Override void initialiseParseFragment(@Nullable Element context) {
9898
// context may be null
9999
state = HtmlTreeBuilderState.Initial;
100-
contextElement = context;
101100
fragmentParsing = true;
102-
Element root = null;
103101

104102
if (context != null) {
103+
final String contextName = context.normalName();
104+
contextElement = new Element(tagFor(contextName, settings), baseUri);
105105
if (context.ownerDocument() != null) // quirks setup:
106106
doc.quirksMode(context.ownerDocument().quirksMode());
107107

108108
// initialise the tokeniser state:
109-
String contextTag = context.normalName();
110-
switch (contextTag) {
109+
switch (contextName) {
111110
case "title":
112111
case "textarea":
113112
tokeniser.transition(TokeniserState.Rcdata);
@@ -132,9 +131,8 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
132131
default:
133132
tokeniser.transition(TokeniserState.Data);
134133
}
135-
root = new Element(tagFor(contextTag, settings), baseUri);
136-
doc.appendChild(root);
137-
push(root);
134+
doc.appendChild(contextElement);
135+
push(contextElement);
138136
resetInsertionMode();
139137

140138
// setup form element to nearest form on context (up ancestor chain). ensures form controls are associated
@@ -148,15 +146,16 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
148146
formSearch = formSearch.parent();
149147
}
150148
}
149+
}
151150

152-
runParser();
153-
if (context != null) {
151+
@Override List<Node> completeParseFragment() {
152+
if (contextElement != null) {
154153
// depending on context and the input html, content may have been added outside of the root el
155154
// e.g. context=p, input=div, the div will have been pushed out.
156-
List<Node> nodes = root.siblingNodes();
155+
List<Node> nodes = contextElement.siblingNodes();
157156
if (!nodes.isEmpty())
158-
root.insertChildren(-1, nodes);
159-
return root.childNodes();
157+
contextElement.insertChildren(-1, nodes);
158+
return contextElement.childNodes();
160159
}
161160
else
162161
return doc.childNodes();

src/main/java/org/jsoup/parser/StreamParser.java

+41-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import java.io.UncheckedIOException;
1818
import java.util.Iterator;
1919
import java.util.LinkedList;
20+
import java.util.List;
2021
import java.util.NoSuchElementException;
2122
import java.util.Queue;
2223
import java.util.Spliterator;
@@ -67,7 +68,7 @@ public StreamParser(Parser parser) {
6768
}
6869

6970
/**
70-
Provide the input for a parse. The input is not read until a consuming operation is called.
71+
Provide the input for a Document parse. The input is not read until a consuming operation is called.
7172
@param input the input to be read.
7273
@param baseUri the URL of this input, for absolute link resolution
7374
@return this parser, for chaining
@@ -81,7 +82,7 @@ public StreamParser parse(Reader input, String baseUri) {
8182
}
8283

8384
/**
84-
Provide the input for a parse. The input is not read until a consuming operation is called.
85+
Provide the input for a Document parse. The input is not read until a consuming operation is called.
8586
@param input the input to be read
8687
@param baseUri the URL of this input, for absolute link resolution
8788
@return this parser
@@ -90,6 +91,32 @@ public StreamParser parse(String input, String baseUri) {
9091
return parse(new StringReader(input), baseUri);
9192
}
9293

94+
/**
95+
Provide the input for a fragment parse. The input is not read until a consuming operation is called.
96+
@param input the input to be read
97+
@param context the optional fragment context element
98+
@param baseUri the URL of this input, for absolute link resolution
99+
@return this parser
100+
@see #completeFragment()
101+
*/
102+
public StreamParser parseFragment(Reader input, @Nullable Element context, String baseUri) {
103+
parse(input, baseUri);
104+
treeBuilder.initialiseParseFragment(context);
105+
return this;
106+
}
107+
108+
/**
109+
Provide the input for a fragment parse. The input is not read until a consuming operation is called.
110+
@param input the input to be read
111+
@param context the optional fragment context element
112+
@param baseUri the URL of this input, for absolute link resolution
113+
@return this parser
114+
@see #completeFragment()
115+
*/
116+
public StreamParser parseFragment(String input, @Nullable Element context, String baseUri) {
117+
return parseFragment(new StringReader(input), context, baseUri);
118+
}
119+
93120
/**
94121
Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each
95122
Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that
@@ -162,6 +189,18 @@ public Document complete() throws IOException {
162189
return doc;
163190
}
164191

192+
/**
193+
When initialized as a fragment parse, runs the parser until the input is fully read, and returns the completed
194+
fragment child nodes.
195+
@return the completed child nodes
196+
@throws IOException if an I/O error occurs
197+
@see #parseFragment(Reader, Element, String)
198+
*/
199+
public List<Node> completeFragment() throws IOException {
200+
treeBuilder.runParser();
201+
return treeBuilder.completeParseFragment();
202+
}
203+
165204
/**
166205
Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the
167206
input will be parsed until the first match is found, or the input is completely read.

src/main/java/org/jsoup/parser/TreeBuilder.java

+9-3
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,16 @@ Document parse(Reader input, String baseUri, Parser parser) {
7878

7979
List<Node> parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) {
8080
initialiseParse(new StringReader(inputFragment), baseUri, parser);
81-
return doParseFragment(context);
81+
initialiseParseFragment(context);
82+
runParser();
83+
return completeParseFragment();
84+
}
85+
86+
void initialiseParseFragment(@Nullable Element context) {
87+
// in Html, sets up context; no-op in XML
8288
}
8389

84-
abstract List<Node> doParseFragment(@Nullable Element context);
90+
abstract List<Node> completeParseFragment();
8591

8692
/** Set the node listener, which will then get callbacks for node insert and removals. */
8793
void nodeListener(NodeVisitor nodeListener) {
@@ -102,7 +108,7 @@ void runParser() {
102108
boolean stepParser() {
103109
// if we have reached the end already, step by popping off the stack, to hit nodeRemoved callbacks:
104110
if (currentToken.type == Token.TokenType.EOF) {
105-
if (stack.isEmpty()) return false;
111+
if (stack == null || stack.isEmpty()) return false; // stack will be null if TB was closed, as in case of runParser() + completeFragment()
106112
pop();
107113
return true;
108114
}

src/main/java/org/jsoup/parser/XmlTreeBuilder.java

+4-6
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ public class XmlTreeBuilder extends TreeBuilder {
3434
@Override
3535
protected void initialiseParse(Reader input, String baseUri, Parser parser) {
3636
super.initialiseParse(input, baseUri, parser);
37-
stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack). Note not push()ed, so not onNodeInserted.
3837
doc.outputSettings()
3938
.syntax(Document.OutputSettings.Syntax.xml)
4039
.escapeMode(Entities.EscapeMode.xhtml)
@@ -49,6 +48,10 @@ Document parse(String input, String baseUri) {
4948
return parse(new StringReader(input), baseUri, new Parser(this));
5049
}
5150

51+
@Override List<Node> completeParseFragment() {
52+
return doc.childNodes();
53+
}
54+
5255
@Override
5356
XmlTreeBuilder newInstance() {
5457
return new XmlTreeBuilder();
@@ -164,9 +167,4 @@ protected void popStackToClose(Token.EndTag endTag) {
164167
}
165168
}
166169
private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain
167-
168-
@Override List<Node> doParseFragment(@Nullable Element context) {
169-
runParser();
170-
return doc.childNodes();
171-
}
172170
}

src/test/java/org/jsoup/parser/StreamParserTest.java

+104-2
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,18 @@
44
import org.jsoup.integration.ParseTest;
55
import org.jsoup.nodes.Document;
66
import org.jsoup.nodes.Element;
7+
import org.jsoup.nodes.Node;
78
import org.jsoup.select.Elements;
8-
import org.jspecify.annotations.NullMarked;
99
import org.junit.jupiter.api.Test;
1010

1111
import java.io.BufferedReader;
1212
import java.io.File;
13-
import java.io.FileReader;
1413
import java.io.IOException;
1514
import java.io.InputStreamReader;
1615
import java.nio.charset.StandardCharsets;
1716
import java.nio.file.Files;
1817
import java.util.Iterator;
18+
import java.util.List;
1919
import java.util.NoSuchElementException;
2020

2121
import static org.junit.jupiter.api.Assertions.*;
@@ -37,6 +37,18 @@ void canStream() {
3737
}
3838
}
3939

40+
@Test
41+
void canStreamXml() {
42+
String html = "<outmost><DIV id=1>D1</DIV><div id=2>D2<p id=3><span>P One</p><p id=4>P Two</p></div><div id=5>D3<p id=6>P three</p>";
43+
try (StreamParser parser = new StreamParser(Parser.xmlParser()).parse(html, "")) {
44+
StringBuilder seen;
45+
seen = new StringBuilder();
46+
parser.stream().forEachOrdered(el -> trackSeen(el, seen));
47+
assertEquals("DIV#1[D1]+;span[P One];p#3+;p#4[P Two];div#2[D2]+;p#6[P three];div#5[D3];outmost;", seen.toString());
48+
// checks expected order, and the + indicates that element had a next sibling at time of emission
49+
}
50+
}
51+
4052
@Test void canIterate() {
4153
// same as stream, just a different interface
4254
String html = "<title>Test</title></head><div id=1>D1</div><div id=2>D2<p id=3><span>P One</p><p id=4>P Two</p></div><div id=5>D3<p id=6>P three</p>";
@@ -327,4 +339,94 @@ private static CharacterReader getReader(StreamParser streamer) {
327339
// the reader should be closed as streamer is closed on completion of read
328340
assertTrue(isClosed(streamer));
329341
}
342+
343+
// Fragments
344+
345+
@Test
346+
void canStreamFragment() {
347+
String html = "<tr id=1><td>One</td><tr id=2><td>Two</td></tr><tr id=3><td>Three</td></tr>";
348+
Element context = new Element("table");
349+
350+
try (StreamParser parser = new StreamParser(Parser.htmlParser()).parseFragment(html, context, "")) {
351+
StringBuilder seen = new StringBuilder();
352+
parser.stream().forEachOrdered(el -> trackSeen(el, seen));
353+
assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", seen.toString());
354+
// checks expected order, and the + indicates that element had a next sibling at time of emission
355+
// note that we don't get a full doc, just the fragment (and the context at the end of the stack)
356+
357+
assertTrue(isClosed(parser)); // as read to completion
358+
}
359+
}
360+
361+
@Test void canIterateFragment() {
362+
// same as stream, just a different interface
363+
String html = "<tr id=1><td>One</td><tr id=2><td>Two</td></tr><tr id=3><td>Three</td></tr>"; // missing </tr>, following <tr> infers it
364+
Element context = new Element("table");
365+
366+
try(StreamParser parser = new StreamParser(Parser.htmlParser()).parseFragment(html, context, "")) {
367+
StringBuilder seen = new StringBuilder();
368+
369+
Iterator<Element> it = parser.iterator();
370+
while (it.hasNext()) {
371+
trackSeen(it.next(), seen);
372+
}
373+
374+
assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;tbody;table;", seen.toString());
375+
// checks expected order, and the + indicates that element had a next sibling at time of emission
376+
// note that we don't get a full doc, just the fragment (and the context at the end of the stack)
377+
378+
assertTrue(isClosed(parser)); // as read to completion
379+
}
380+
}
381+
382+
@Test
383+
void canSelectAndCompleteFragment() throws IOException {
384+
String html = "<tr id=1><td>One</td><tr id=2><td>Two</td></tr><tr id=3><td>Three</td></tr>";
385+
Element context = new Element("table");
386+
387+
try (StreamParser parser = new StreamParser(Parser.htmlParser()).parseFragment(html, context, "")) {
388+
Element first = parser.expectNext("td");
389+
assertEquals("One", first.ownText());
390+
391+
Element el = parser.expectNext("td");
392+
assertEquals("Two", el.ownText());
393+
394+
el = parser.expectNext("td");
395+
assertEquals("Three", el.ownText());
396+
397+
el = parser.selectNext("td");
398+
assertNull(el);
399+
400+
List<Node> nodes = parser.completeFragment();
401+
assertEquals(1, nodes.size()); // should be the inferred tbody
402+
Node tbody = nodes.get(0);
403+
assertEquals("tbody", tbody.nodeName());
404+
List<Node> trs = tbody.childNodes();
405+
assertEquals(3, trs.size()); // should be the three TRs
406+
assertSame(trs.get(0).childNode(0), first); // tr -> td
407+
408+
assertSame(parser.document(), first.ownerDocument()); // the shell document for this fragment
409+
}
410+
}
411+
412+
@Test
413+
void canStreamFragmentXml() throws IOException {
414+
String html = "<tr id=1><td>One</td></tr><tr id=2><td>Two</td></tr><tr id=3><td>Three</td></tr>";
415+
Element context = new Element("Other");
416+
417+
try (StreamParser parser = new StreamParser(Parser.xmlParser()).parseFragment(html, context, "")) {
418+
StringBuilder seen = new StringBuilder();
419+
parser.stream().forEachOrdered(el -> trackSeen(el, seen));
420+
assertEquals("td[One];tr#1+;td[Two];tr#2+;td[Three];tr#3;", seen.toString());
421+
// checks expected order, and the + indicates that element had a next sibling at time of emission
422+
// note that we don't get a full doc, just the fragment
423+
424+
assertTrue(isClosed(parser)); // as read to completion
425+
426+
List<Node> nodes = parser.completeFragment();
427+
assertEquals(3, nodes.size());
428+
assertEquals("tr", nodes.get(0).nodeName());
429+
}
430+
}
431+
330432
}

0 commit comments

Comments
 (0)