Skip to content

Commit

Permalink
improve performance, add streaming api
Browse files Browse the repository at this point in the history
  • Loading branch information
aumetra committed Nov 18, 2023
1 parent d69a686 commit a1264fc
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 22 deletions.
8 changes: 8 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ harness = false

[dependencies]
ahash = "0.8.6"
bytecount = { version = "0.6.7", features = ["runtime-dispatch-simd"] }
lol_html = "1.2.0"
once_cell = "1.18.0"
slab = "0.4.9"
thiserror = "1.0.50"

[dev-dependencies]
ammonia = "3.3.0"
Expand Down
87 changes: 70 additions & 17 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,22 @@

use ahash::{AHashMap, AHashSet};
use lol_html::{
errors::RewritingError,
html_content::{Comment, ContentType, DocumentEnd, Element, TextChunk},
DocumentContentHandlers, ElementContentHandlers, HandlerResult, Selector, Settings,
DocumentContentHandlers, ElementContentHandlers, HandlerResult, HtmlRewriter, Selector,
Settings,
};
use once_cell::sync::Lazy;
use slab::Slab;
use std::{borrow::Cow, cell::RefCell, fmt::Write, rc::Rc, str::FromStr};
use std::{
borrow::Cow, cell::RefCell, fmt::Write, iter, rc::Rc, str::FromStr, string::FromUtf8Error,
};
use thiserror::Error;

#[doc(hidden)]
pub use ahash;

pub use lol_html::{errors::RewritingError, MemorySettings};
pub use lol_html::MemorySettings;

mod macros;

Expand All @@ -36,7 +41,10 @@ static SELECT_ALL: Lazy<Selector> = Lazy::new(|| Selector::from_str("*").unwrap(
///
/// See [`BubbleBath::clean`] documentation
#[inline]
pub fn clean(content: &str) -> Result<String, RewritingError> {
pub fn clean<C>(content: C) -> Result<String, Error>
where
C: AsRef<[u8]>,
{
GLOBAL_BUBBLE_BATH.clean(content)
}

Expand Down Expand Up @@ -66,6 +74,19 @@ fn clean_text(source: &str) -> String {
acc
}

/// Errors that can occur while sanitizing HTML
///
/// The enum is `#[non_exhaustive]`, so code matching on it from another crate
/// needs a wildcard arm.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum Error {
/// The rewriting of the HTML content failed
///
/// Wraps the [`RewritingError`] reported by the HTML rewriter. The variant is
/// `transparent`, i.e. the wrapped error's message is shown as-is.
#[error(transparent)]
Rewriting(#[from] RewritingError),

/// The bytes were not valid UTF-8
///
/// Wraps the [`FromUtf8Error`] produced when sanitized output bytes are
/// converted into a `String`.
#[error(transparent)]
Utf8(#[from] FromUtf8Error),
}

/// HTML sanitizer
///
/// `bubble-bath` is allow-list based, meaning all tags are by default cleaned.
Expand Down Expand Up @@ -249,25 +270,19 @@ impl BubbleBath<'_> {
*chunk.as_mut_str() = clean_text(chunk.as_str());
}

/// Clean the provided HTML content
/// Clean HTML in a streaming fashion
///
/// # Errors
///
/// - The HTML rewriter ran out of memory
/// - The HTML parser ran into an ambiguous state (in this case you should just discard the text instead of trying to fix it)
/// - The name of an attribute you put into the `set_tag_attributes` hashmap is invalid
#[inline]
pub fn clean(&self, content: &str) -> Result<String, RewritingError> {
let mut content = content.to_string();

// Balance out the opening tags
let opening_tags = content.bytes().filter(|b| *b == b'<').count();
let closing_tags = content.bytes().filter(|b| *b == b'>').count();
let closing_tags_iter =
std::iter::repeat('>').take(opening_tags.saturating_sub(closing_tags));

content.extend(closing_tags_iter);

pub fn clean_streaming<'a, I, S>(&self, input: I, sink: S) -> Result<(), Error>
where
I: Iterator<Item = &'a [u8]>,
S: FnMut(&[u8]),
{
let unclosed_tags = Rc::new(RefCell::new(Slab::new()));

let comment_handler = |comment: &mut Comment<'_>| {
Expand Down Expand Up @@ -311,7 +326,45 @@ impl BubbleBath<'_> {
..Settings::default()
};

lol_html::rewrite_str(&content, settings)
let mut opening_tags: usize = 0;
let mut rewriter = HtmlRewriter::new(settings, sink);

for chunk in input {
let tmp_opening_tags = bytecount::count(chunk, b'<');
let tmp_closing_tags = bytecount::count(chunk, b'>');

opening_tags = opening_tags.saturating_add(tmp_opening_tags);
opening_tags = opening_tags.saturating_sub(tmp_closing_tags);

rewriter.write(chunk)?;
}

for _ in 0..opening_tags {
rewriter.write(&[b'>'])?;
}

rewriter.end()?;

Ok(())
}

/// Clean the provided HTML content
///
/// # Errors
///
/// - The output of the HTML transformer was not valid UTF-8
///
/// Check [`Self::clean_streaming`] for additional errors
#[inline]
pub fn clean<C>(&self, content: C) -> Result<String, Error>
where
C: AsRef<[u8]>,
{
let content = content.as_ref();
let mut acc = Vec::with_capacity(content.len());
self.clean_streaming(iter::once(content), |out| acc.extend_from_slice(out))?;

Ok(String::from_utf8(acc)?)
}
}

Expand Down
6 changes: 3 additions & 3 deletions tests/ammonia_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@ use bubble_bath::*;

#[test]
fn deeply_nested_allowlisted() {
clean(&"<b>".repeat(60_000)).unwrap();
clean("<b>".repeat(60_000)).unwrap();
}

#[test]
fn deeply_nested_denylisted() {
clean(&"<b-b>".repeat(60_000)).unwrap();
clean("<b-b>".repeat(60_000)).unwrap();
}

#[test]
fn deeply_nested_alternating() {
clean(&"<b-b>".repeat(35_000)).unwrap();
clean("<b-b>".repeat(35_000)).unwrap();
}

#[test]
Expand Down
4 changes: 2 additions & 2 deletions tests/torture.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::fs;
fn torture() {
insta::glob!("inputs/*", |path| {
let input = fs::read_to_string(path).unwrap();
assert_snapshot!(bubble_bath::clean(&input).unwrap());
assert_snapshot!(bubble_bath::clean(input).unwrap());
});
}

Expand All @@ -19,6 +19,6 @@ fn torture_escaped() {
..BubbleBath::default()
};

assert_snapshot!(bubble_bath.clean(&input).unwrap());
assert_snapshot!(bubble_bath.clean(input).unwrap());
});
}

0 comments on commit a1264fc

Please sign in to comment.