Skip to content

Commit

Permalink
Merge pull request #221 from rust-scraper/sorted-vec-instead-of-hash-…
Browse files Browse the repository at this point in the history
…table

RFC: Drop the hash table for per-element attributes in favor of a more compact sorted vector
  • Loading branch information
LoZack19 authored Dec 2, 2024
2 parents 8d3e74b + ee66ee8 commit 26f04ed
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 63 deletions.
72 changes: 26 additions & 46 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions scraper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@ repository = "https://github.com/causal-agent/scraper"
readme = "README.md"

[dependencies]
ahash = "0.8.0"
cssparser = "0.34.0"
ego-tree = "0.9.0"
html5ever = "0.29.0"
indexmap = { version = "2.6.0", optional = true }
indexmap = { version = "2.7.0", optional = true }
precomputed-hash = "0.1.1"
selectors = "0.26.0"
tendril = "0.4.3"
Expand Down
11 changes: 11 additions & 0 deletions scraper/src/html/tree_sink.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,17 @@ impl TreeSink for HtmlTreeSink {
};

for attr in attrs {
#[cfg(not(feature = "deterministic"))]
if let Err(idx) = element
.attrs
.binary_search_by(|(name, _)| name.cmp(&attr.name))
{
element
.attrs
.insert(idx, (attr.name, make_tendril(attr.value)));
}

#[cfg(feature = "deterministic")]
element
.attrs
.entry(attr.name)
Expand Down
41 changes: 26 additions & 15 deletions scraper/src/node.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
//! HTML nodes.
#[cfg(not(feature = "deterministic"))]
use ahash::AHashMap as HashMap;
#[cfg(not(feature = "deterministic"))]
use std::collections::hash_map;
use std::fmt;
use std::ops::Deref;
use std::slice::Iter as SliceIter;
Expand Down Expand Up @@ -219,7 +215,7 @@ pub type Attributes = indexmap::IndexMap<QualName, StrTendril>;
/// Please enable the `deterministic` feature for order-preserving
/// (de)serialization.
#[cfg(not(feature = "deterministic"))]
pub type Attributes = HashMap<QualName, StrTendril>;
pub type Attributes = Vec<(QualName, StrTendril)>;

/// An HTML element.
#[derive(Clone, PartialEq, Eq)]
Expand All @@ -232,16 +228,20 @@ pub struct Element {

id: OnceCell<Option<StrTendril>>,

classes: OnceCell<Vec<LocalName>>,
classes: OnceCell<Box<[LocalName]>>,
}

impl Element {
#[doc(hidden)]
pub fn new(name: QualName, attributes: Vec<Attribute>) -> Self {
let attrs = attributes
#[allow(unused_mut)]
let mut attrs = attributes
.into_iter()
.map(|a| (a.name, crate::tendril_util::make(a.value)))
.collect();
.map(|attr| (attr.name, crate::tendril_util::make(attr.value)))
.collect::<Attributes>();

#[cfg(not(feature = "deterministic"))]
attrs.sort_unstable_by(|lhs, rhs| lhs.0.cmp(&rhs.0));

Element {
attrs,
Expand Down Expand Up @@ -277,17 +277,17 @@ impl Element {
/// Returns an iterator over the element's classes.
pub fn classes(&self) -> Classes {
let classes = self.classes.get_or_init(|| {
let mut classes: Vec<LocalName> = self
let mut classes = self
.attrs
.iter()
.filter(|(name, _)| name.local.as_ref() == "class")
.flat_map(|(_, value)| value.split_whitespace().map(LocalName::from))
.collect();
.flat_map(|(_, value)| value.split_ascii_whitespace().map(LocalName::from))
.collect::<Vec<_>>();

classes.sort_unstable();
classes.dedup();

classes
classes.into_boxed_slice()
});

Classes {
Expand All @@ -298,7 +298,18 @@ impl Element {
/// Returns the value of an attribute.
pub fn attr(&self, attr: &str) -> Option<&str> {
let qualname = QualName::new(None, ns!(), LocalName::from(attr));
self.attrs.get(&qualname).map(Deref::deref)

#[cfg(not(feature = "deterministic"))]
let value = self
.attrs
.binary_search_by(|attr| attr.0.cmp(&qualname))
.ok()
.map(|idx| &*self.attrs[idx].1);

#[cfg(feature = "deterministic")]
let value = self.attrs.get(&qualname).map(Deref::deref);

value
}

/// Returns an iterator over the element's attributes.
Expand Down Expand Up @@ -330,7 +341,7 @@ pub type AttributesIter<'a> = indexmap::map::Iter<'a, QualName, StrTendril>;

/// An iterator over a node's attributes.
#[cfg(not(feature = "deterministic"))]
pub type AttributesIter<'a> = hash_map::Iter<'a, QualName, StrTendril>;
pub type AttributesIter<'a> = SliceIter<'a, (QualName, StrTendril)>;

/// Iterator over attributes.
#[allow(missing_debug_implementations)]
Expand Down

0 comments on commit 26f04ed

Please sign in to comment.