Skip to content

Commit ab877f2

Browse files
spicychickensaucespicychickensaucejonatanklosko
authored
Implement parent_nodes + nth_child (#25)
Co-authored-by: spicychickensauce <[email protected]> Co-authored-by: Jonatan Kłosko <[email protected]>
1 parent 7d43d42 commit ab877f2

File tree

4 files changed

+200
-4
lines changed

4 files changed

+200
-4
lines changed

c_src/lazy_html.cpp

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <stdexcept>
88
#include <string>
99
#include <tuple>
10+
#include <unordered_set>
1011
#include <variant>
1112

1213
#include <lexbor/html/html.h>
@@ -43,8 +44,10 @@ auto resource = fine::Atom("resource");
4344

4445
struct DocumentRef {
4546
lxb_html_document_t *document;
47+
bool is_fragment;
4648

47-
DocumentRef(lxb_html_document_t *document) : document(document) {}
49+
DocumentRef(lxb_html_document_t *document, bool is_fragment)
50+
: document(document), is_fragment(is_fragment) {}
4851

4952
~DocumentRef() { lxb_html_document_destroy(this->document); }
5053
};
@@ -97,7 +100,7 @@ ExLazyHTML from_document(ErlNifEnv *env, ErlNifBinary html) {
97100
throw std::runtime_error("failed to parse html document");
98101
}
99102

100-
auto document_ref = std::make_shared<DocumentRef>(document);
103+
auto document_ref = std::make_shared<DocumentRef>(document, false);
101104
document_guard.deactivate();
102105

103106
auto nodes = std::vector<lxb_dom_node_t *>();
@@ -129,7 +132,7 @@ ExLazyHTML from_fragment(ErlNifEnv *env, ErlNifBinary html) {
129132
throw std::runtime_error("failed to parse html fragment");
130133
}
131134

132-
auto document_ref = std::make_shared<DocumentRef>(document);
135+
auto document_ref = std::make_shared<DocumentRef>(document, true);
133136
document_guard.deactivate();
134137

135138
auto nodes = std::vector<lxb_dom_node_t *>();
@@ -522,7 +525,10 @@ ExLazyHTML from_tree(ErlNifEnv *env, std::vector<fine::Term> tree) {
522525
nodes.push_back(node);
523526
}
524527

525-
auto document_ref = std::make_shared<DocumentRef>(document);
528+
bool is_fragment =
529+
nodes.empty() || !lxb_html_tree_node_is(nodes.front(), LXB_TAG_HTML);
530+
531+
auto document_ref = std::make_shared<DocumentRef>(document, is_fragment);
526532
document_guard.deactivate();
527533

528534
return ExLazyHTML(fine::make_resource<LazyHTML>(document_ref, nodes, false));
@@ -714,6 +720,57 @@ ExLazyHTML child_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
714720

715721
FINE_NIF(child_nodes, 0);
716722

723+
ExLazyHTML parent_node(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
724+
bool is_document = !ex_lazy_html.resource->document_ref->is_fragment;
725+
auto nodes = std::vector<lxb_dom_node_t *>();
726+
auto inserted_nodes = std::unordered_set<lxb_dom_node_t *>();
727+
728+
for (auto node : ex_lazy_html.resource->nodes) {
729+
auto parent = lxb_dom_node_parent(node);
730+
if (parent != NULL && parent->type == LXB_DOM_NODE_TYPE_ELEMENT &&
731+
(is_document || !lxb_html_tree_node_is(parent, LXB_TAG_HTML))) {
732+
auto inserted_node = inserted_nodes.find(parent);
733+
if (inserted_node == inserted_nodes.end()) {
734+
inserted_nodes.insert(parent);
735+
nodes.push_back(parent);
736+
}
737+
}
738+
}
739+
return ExLazyHTML(fine::make_resource<LazyHTML>(
740+
ex_lazy_html.resource->document_ref, nodes, true));
741+
}
742+
FINE_NIF(parent_node, ERL_NIF_DIRTY_JOB_CPU_BOUND);
743+
744+
std::vector<int64_t> nth_child(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
745+
auto values = std::vector<int64_t>();
746+
for (auto node : ex_lazy_html.resource->nodes) {
747+
if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
748+
continue;
749+
}
750+
751+
auto parent = lxb_dom_node_parent(node);
752+
if (parent == NULL) {
753+
// We're at the root, nth_child is 1
754+
values.push_back(1);
755+
} else {
756+
int64_t i = 1;
757+
for (auto child = lxb_dom_node_first_child(parent); child != NULL;
758+
child = lxb_dom_node_next(child)) {
759+
if (child == node) {
760+
break;
761+
}
762+
if (child->type == LXB_DOM_NODE_TYPE_ELEMENT) {
763+
i++;
764+
}
765+
}
766+
values.push_back(i);
767+
}
768+
}
769+
770+
return values;
771+
}
772+
FINE_NIF(nth_child, ERL_NIF_DIRTY_JOB_CPU_BOUND);
773+
717774
std::string text(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
718775
auto document = ex_lazy_html.resource->document_ref->document;
719776

lib/lazy_html.ex

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,48 @@ defmodule LazyHTML do
357357
LazyHTML.NIF.child_nodes(lazy_html)
358358
end
359359

360+
@doc """
361+
Returns the (unique) parent nodes of the root nodes in `lazy_html`.
362+
Each parent appears once in the result, in the order it is first
encountered, even when several root nodes share it. Only element
parents are returned; when `lazy_html` was built from a fragment,
top-level nodes have no parent and contribute nothing to the result.
363+
## Examples
364+
365+
iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>Hello</span> <span>world</span></div>|)
366+
iex> spans = LazyHTML.query(lazy_html, "span")
367+
iex> LazyHTML.parent_node(spans)
368+
#LazyHTML<
369+
1 node (from selector)
370+
#1
371+
<div><span>Hello</span> <span>world</span></div>
372+
>
373+
374+
"""
375+
@spec parent_node(t()) :: t()
376+
def parent_node(lazy_html) do
377+
LazyHTML.NIF.parent_node(lazy_html)
378+
end
379+
380+
@doc """
381+
Returns the position among its siblings for every root element in `lazy_html`.
382+
383+
The position numbering is 1-based and only considers siblings that
384+
are elements, so as to match the `:nth-child` CSS pseudo-class.
A root element that has no parent is reported at position 1.
385+
386+
Note that root nodes that are not elements (such as text or comments)
387+
are skipped, so they have no corresponding number in the result.
388+
389+
## Examples
390+
391+
iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>1</span><span>2</span></div>|)
392+
iex> spans = LazyHTML.query(lazy_html, "span")
393+
iex> LazyHTML.nth_child(spans)
394+
[1, 2]
395+
396+
"""
397+
@spec nth_child(t()) :: list(integer())
398+
def nth_child(lazy_html) do
399+
LazyHTML.NIF.nth_child(lazy_html)
400+
end
401+
360402
@doc """
361403
Returns the text content of all nodes in `lazy_html`.
362404

lib/lazy_html/nif.ex

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ defmodule LazyHTML.NIF do
2121
def filter(_lazy_html, _css_selector), do: err!()
2222
def query_by_id(_lazy_html, _id), do: err!()
2323
def child_nodes(_lazy_html), do: err!()
24+
def parent_node(_lazy_html), do: err!()
25+
def nth_child(_lazy_html), do: err!()
2426
def text(_lazy_html), do: err!()
2527
def attribute(_lazy_html, _name), do: err!()
2628
def attributes(_lazy_html), do: err!()

test/lazy_html_test.exs

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,101 @@ defmodule LazyHTMLTest do
250250
end
251251
end
252252

253+
describe "parent_node/1" do
254+
# The two spans live at different depths: one inside div#b, one directly
# inside div#a. Walks upwards one parent_node/1 call at a time.
test "from selector of nodes on different levels" do
255+
lazy_html =
256+
LazyHTML.from_fragment("""
257+
<div id="a">
258+
<div id="b">
259+
<span>Hello</span>
260+
</div>
261+
<span>world</span>
262+
</div>
263+
""")
264+
265+
spans = LazyHTML.query(lazy_html, "span")
266+
parents = LazyHTML.parent_node(spans)
267+
parent_ids = parents |> LazyHTML.attribute("id") |> Enum.sort()
268+
assert parent_ids == ["a", "b"]
269+
270+
# div#a is a top-level fragment node and has no parent, so the only
# grandparent is the parent of div#b (which is div#a itself)
271+
grandparents = LazyHTML.parent_node(parents)
272+
assert LazyHTML.tag(grandparents) == ["div"]
273+
274+
# one more step up from the top-level div yields an empty result
great_grandparents = LazyHTML.parent_node(grandparents)
275+
assert great_grandparents |> Enum.count() == 0
276+
end
277+
278+
# Both spans sit at the same depth, each inside its own div (div#b, div#c),
# and those two divs share div#a as their parent.
test "from selector of nodes on same level" do
279+
lazy_html =
280+
LazyHTML.from_fragment("""
281+
<div id="a">
282+
<div id="b">
283+
<span>Hello</span>
284+
</div>
285+
<div id="c">
286+
<span>world</span>
287+
</div>
288+
</div>
289+
""")
290+
291+
spans = LazyHTML.query(lazy_html, "span")
292+
parents = LazyHTML.parent_node(spans)
293+
parent_ids = parents |> LazyHTML.attribute("id") |> Enum.sort()
294+
assert parent_ids == ["b", "c"]
295+
296+
# div#b and div#c share the same parent, so it is returned only once
297+
grandparent = LazyHTML.parent_node(parents)
298+
assert LazyHTML.attribute(grandparent, "id") == ["a"]
299+
end
300+
301+
# Test helper: repeatedly applies LazyHTML.parent_node/1 and returns the tag
# names of the parents found, outermost ancestor first.
defp ancestor_chain(node) do
302+
parent = LazyHTML.parent_node(node)
303+
304+
# recursion ends once the current node set is empty
if Enum.count(node) == 0 do
305+
[]
306+
else
307+
# ancestors of the parent come first, then the parent's own tag(s)
ancestor_chain(parent) ++ LazyHTML.tag(parent)
308+
end
309+
end
310+
311+
# Compares the ancestor chain of a <div> across the different constructors.
test "last parent node is <html> if instantiated via from_document and similar" do
312+
# full document: the chain goes all the way up to <html>
lazy_html = LazyHTML.from_document("<html><body><div>root</div></body></html>")
313+
assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body"]
314+
315+
# fragment: the top-level div has no ancestors
lazy_html = LazyHTML.from_fragment("<div>root</div>")
316+
assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == []
317+
318+
# tree whose first node is not <html>: same result as for a fragment
lazy_html = LazyHTML.from_tree([{"div", [], []}])
319+
assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == []
320+
321+
# tree whose first node is <html>: same result as for a document
lazy_html = LazyHTML.from_tree([{"html", [], [{"body", [], [{"div", [], []}]}]}])
322+
assert lazy_html |> LazyHTML.query("div") |> ancestor_chain() == ["html", "body"]
323+
end
324+
end
325+
326+
describe "nth_child/1" do
327+
# Checks that nth_child/1 counts only element siblings and that the numbers
# agree with the `:nth-child` CSS selector.
test "nth_child gives position" do
328+
lazy_html =
329+
LazyHTML.from_fragment("""
330+
<div>
331+
Text isn't counted.
332+
<span>1</span>
333+
<!-- neither are comments -->
334+
<span>2</span>
335+
</div>
336+
""")
337+
338+
# the top-level div is the first (and only) element at its level
assert LazyHTML.nth_child(lazy_html) == [1]
339+
assert lazy_html["div"] |> LazyHTML.nth_child() == [1]
340+
# the text and comment nodes between the spans do not affect the numbering
assert lazy_html["span"] |> LazyHTML.nth_child() == [1, 2]
341+
342+
# Verify numbering matches css selector
343+
assert lazy_html["span:nth-child(1)"] |> LazyHTML.text() == "1"
344+
assert lazy_html["span:nth-child(2)"] |> LazyHTML.text() == "2"
345+
end
346+
end
347+
253348
describe "query_by_id/2" do
254349
test "raises when an empty id is given" do
255350
assert_raise ArgumentError, ~r/id cannot be empty/, fn ->

0 commit comments

Comments
 (0)