Skip to content

Commit 6cb1b83

Browse files
committed
feat(path): add a parser config option to accept paths that are not RFC 3986 compliant
1 parent ab76284 commit 6cb1b83

File tree

6 files changed

+139
-22
lines changed

6 files changed

+139
-22
lines changed

benches/parse.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ fn uri(c: &mut Criterion) {
113113
.throughput(Throughput::Bytes(input.len() as u64))
114114
.bench_function(name, |b| b.iter(|| {
115115
let mut b = httparse::_benchable::Bytes::new(black_box(input));
116-
httparse::_benchable::parse_uri(&mut b).unwrap()
116+
httparse::_benchable::parse_uri(&mut b, false).unwrap()
117117
}));
118118
}
119119

src/lib.rs

+57-5
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,32 @@ static URI_MAP: [bool; 256] = byte_map![
9090
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9191
];
9292

93+
// Lookup table for the non-compliant URI mode, indexed by byte value (16 entries per row,
// so row N covers bytes 0xN0..=0xNF).
// Only 0x00..=0x20 (controls and space) and 0x7F (DEL) are 0; every other byte, including
// all of 0x80..=0xFF, is 1. `is_uri_token` consults this table when `allow_non_compliant`
// is set.
// NOTE(review): presumably `byte_map!` turns each 0/1 into `false`/`true`; confirm
// against the macro definition.
static URI_NON_COMPLIANT_MAP: [bool; 256] = byte_map![
94+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96+
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
97+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
98+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
99+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
100+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
101+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
102+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
103+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
104+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
105+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110+
];
111+
93112
// Diff view: the `-` lines show the old one-argument form, the `+` lines the new form.
// New behavior: look `b` up in `URI_NON_COMPLIANT_MAP` when `allow_non_compliant` is
// true, otherwise in `URI_MAP` (the only table the old form used). The looked-up `bool`
// is returned as-is.
#[inline]
94-
pub(crate) fn is_uri_token(b: u8) -> bool {
95-
URI_MAP[b as usize]
113+
pub(crate) fn is_uri_token(b: u8, allow_non_compliant: bool) -> bool {
114+
if allow_non_compliant {
115+
URI_NON_COMPLIANT_MAP[b as usize]
116+
} else {
117+
URI_MAP[b as usize]
118+
}
96119
}
97120

98121
static HEADER_NAME_MAP: [bool; 256] = byte_map![
@@ -260,6 +283,7 @@ pub struct ParserConfig {
260283
allow_multiple_spaces_in_request_line_delimiters: bool,
261284
allow_multiple_spaces_in_response_status_delimiters: bool,
262285
allow_space_before_first_header_name: bool,
286+
allow_rfc3986_non_compliant_path: bool,
263287
ignore_invalid_headers_in_responses: bool,
264288
ignore_invalid_headers_in_requests: bool,
265289
}
@@ -539,7 +563,7 @@ impl<'h, 'b> Request<'h, 'b> {
539563
if config.allow_multiple_spaces_in_request_line_delimiters {
540564
complete!(skip_spaces(&mut bytes));
541565
}
542-
self.path = Some(complete!(parse_uri(&mut bytes)));
566+
self.path = Some(complete!(parse_uri(&mut bytes, config.allow_rfc3986_non_compliant_path)));
543567
if config.allow_multiple_spaces_in_request_line_delimiters {
544568
complete!(skip_spaces(&mut bytes));
545569
}
@@ -952,9 +976,9 @@ fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
952976
#[doc(hidden)]
953977
#[allow(missing_docs)]
954978
// WARNING: Exported for internal benchmarks, not fit for public consumption
955-
pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
979+
pub fn parse_uri<'a>(bytes: &mut Bytes<'a>, allow_non_compliant: bool) -> Result<&'a str> {
956980
let start = bytes.pos();
957-
simd::match_uri_vectored(bytes);
981+
simd::match_uri_vectored(bytes, allow_non_compliant);
958982
let end = bytes.pos();
959983

960984
if next!(bytes) == b' ' {
@@ -2676,4 +2700,32 @@ mod tests {
26762700
assert_eq!(response.headers[0].name, "foo");
26772701
assert_eq!(response.headers[0].value, &b"bar"[..]);
26782702
}
2703+
2704+
// With the default `ParserConfig`, a request line whose path contains the raw bytes
// \xE2\x80\x99 (all >= 0x80) must be rejected with `Error::Token`.
#[test]
2705+
fn test_rfc3986_non_compliant_path_ko() {
2706+
let mut headers = [EMPTY_HEADER; 1];
2707+
let mut request = Request::new(&mut headers[..]);
2708+
2709+
let result = crate::ParserConfig::default().parse_request(&mut request, b"GET /test?post=I\xE2\x80\x99msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n");
2710+
2711+
assert_eq!(result, Err(crate::Error::Token));
2712+
}
2713+
2714+
// With `allow_rfc3986_non_compliant_path` set to true, a request whose path contains the
// raw bytes \xE2\x80\x99 parses completely: all 67 input bytes are consumed, the path is
// returned with the non-ASCII character intact, and the single `Host` header is read.
#[test]
2715+
fn test_rfc3986_non_compliant_path_ok() {
2716+
let mut headers = [EMPTY_HEADER; 1];
2717+
let mut request = Request::new(&mut headers[..]);
2718+
let mut config = crate::ParserConfig::default();
2719+
// The field is assigned directly; no setter method is visible in this diff.
config.allow_rfc3986_non_compliant_path = true;
2720+
2721+
let result = config.parse_request(&mut request, b"GET /test?post=I\xE2\x80\x99msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n");
2722+
2723+
assert_eq!(result, Ok(Status::Complete(67)));
2724+
assert_eq!(request.version.unwrap(), 1);
2725+
assert_eq!(request.method.unwrap(), "GET");
2726+
assert_eq!(request.path.unwrap(), "/test?post=I’msorryIforkedyou");
2727+
assert_eq!(request.headers.len(), 1);
2728+
assert_eq!(request.headers[0].name, "Host");
2729+
assert_eq!(request.headers[0].value, &b"example.org"[..]);
2730+
}
26792731
}

src/simd/avx2.rs

+38-5
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,23 @@ use crate::iter::Bytes;
22

33
// Diff view: `-` lines are the old one-argument form, `+` lines the new form.
// New behavior: while at least 32 bytes remain, one of the two 32-byte matchers (chosen
// by `allow_non_compliant`) is run and `bytes` is advanced by its result. A result below
// 32 means a rejected byte was found, so the function returns there. Once fewer than 32
// bytes remain, the rest is handed to the SSE4.2 routine with the same flag.
//
// Safety: compiled with `avx2` and `sse4.2` enabled via `target_feature`, so the caller
// must ensure the CPU supports both.
#[inline]
44
#[target_feature(enable = "avx2", enable = "sse4.2")]
5-
pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
5+
pub unsafe fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) {
66
while bytes.as_ref().len() >= 32 {
7-
let advance = match_url_char_32_avx(bytes.as_ref());
7+
8+
let advance = if allow_non_compliant {
9+
match_url_char_non_compliant_32_avx(bytes.as_ref())
10+
} else {
11+
match_url_char_32_avx(bytes.as_ref())
12+
};
13+
814
bytes.advance(advance);
915

1016
if advance != 32 {
1117
return;
1218
}
1319
}
1420
// do both, since avx2 only works when bytes.len() >= 32
15-
super::sse42::match_uri_vectored(bytes)
21+
super::sse42::match_uri_vectored(bytes, allow_non_compliant)
1622
}
1723

1824
#[inline(always)]
@@ -56,6 +62,33 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
5662
r.trailing_zeros() as usize
5763
}
5864

65+
// Returns how many bytes at the start of `buf` (0..=32) are accepted in non-compliant
// mode, i.e. fall in 0x21..=0x7e or 0x80..=0xff. Only the first 32 bytes are examined.
//
// Safety: `buf` must hold at least 32 bytes (only checked by `debug_assert!`), and the
// CPU must support the AVX2 intrinsics used below.
// NOTE(review): the SIMD table tests visible in this diff call the matcher only with
// `false`; confirm this routine is also checked against `URI_NON_COMPLIANT_MAP`.
#[inline(always)]
66+
#[allow(non_snake_case, overflowing_literals)]
67+
#[allow(unused)]
68+
unsafe fn match_url_char_non_compliant_32_avx(buf: &[u8]) -> usize {
69+
debug_assert!(buf.len() >= 32);
70+
71+
#[cfg(target_arch = "x86")]
72+
use core::arch::x86::*;
73+
#[cfg(target_arch = "x86_64")]
74+
use core::arch::x86_64::*;
75+
76+
let ptr = buf.as_ptr();
77+
78+
// %x21-%x7e %x80-%xff
79+
let DEL: __m256i = _mm256_set1_epi8(0x7f);
80+
let LOW: __m256i = _mm256_set1_epi8(0x21);
81+
82+
// unaligned 32-byte load from the start of `buf`
let dat = _mm256_lddqu_si256(ptr as *const _);
83+
// unsigned comparison dat >= LOW
84+
let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat);
85+
// lanes equal to 0x7f (DEL)
let del = _mm256_cmpeq_epi8(dat, DEL);
86+
// accepted lanes: `>= 0x21` and not DEL (`andnot` computes `!del & low`)
let bit = _mm256_andnot_si256(del, low);
87+
// one mask bit per byte lane, lane 0 in bit 0
let res = _mm256_movemask_epi8(bit) as u32;
88+
// TODO: use .trailing_ones() once MSRV >= 1.46
89+
// trailing ones of `res` = count of leading accepted bytes; yields 32 when all match
(!res).trailing_zeros() as usize
90+
}
91+
5992
#[target_feature(enable = "avx2", enable = "sse4.2")]
6093
pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) {
6194
while bytes.as_ref().len() >= 32 {
@@ -107,11 +140,11 @@ fn avx2_code_matches_uri_chars_table() {
107140

108141
#[allow(clippy::undocumented_unsafe_blocks)]
109142
unsafe {
110-
assert!(byte_is_allowed(b'_', match_uri_vectored));
143+
assert!(byte_is_allowed(b'_', |b| match_uri_vectored(b, false)));
111144

112145
for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() {
113146
assert_eq!(
114-
byte_is_allowed(b as u8, match_uri_vectored), allowed,
147+
byte_is_allowed(b as u8, |b| match_uri_vectored(b, false)), allowed,
115148
"byte_is_allowed({:?}) should be {:?}", b, allowed,
116149
);
117150
}

src/simd/runtime.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,13 @@ pub fn match_header_name_vectored(bytes: &mut Bytes) {
3434
super::swar::match_header_name_vectored(bytes);
3535
}
3636

37-
// Diff view: `-` lines are the old one-argument form, `+` lines the new form.
// New behavior: dispatch on `get_runtime_feature()` to the AVX2, SSE4.2 or SWAR
// implementation of `match_uri_vectored`, forwarding `allow_non_compliant` unchanged.
pub fn match_uri_vectored(bytes: &mut Bytes) {
37+
pub fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) {
3838
// SAFETY: calls are guarded by a feature check
3939
unsafe {
4040
match get_runtime_feature() {
41-
AVX2 => avx2::match_uri_vectored(bytes),
42-
SSE42 => sse42::match_uri_vectored(bytes),
43-
_ /* NOP */ => super::swar::match_uri_vectored(bytes),
41+
AVX2 => avx2::match_uri_vectored(bytes, allow_non_compliant),
42+
SSE42 => sse42::match_uri_vectored(bytes, allow_non_compliant),
43+
_ /* NOP */ => super::swar::match_uri_vectored(bytes, allow_non_compliant),
4444
}
4545
}
4646
}

src/simd/sse42.rs

+37-5
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
11
use crate::iter::Bytes;
22

33
// Diff view: `-` lines are the old one-argument form, `+` lines the new form.
// New behavior: while at least 16 bytes remain, one of the two 16-byte matchers (chosen
// by `allow_non_compliant`) is run and `bytes` is advanced by its result. A result below
// 16 means a rejected byte was found, so the function returns there. Once fewer than 16
// bytes remain, the rest is handed to the SWAR routine with the same flag.
//
// Safety: compiled with `sse4.2` enabled via `target_feature`, so the caller must ensure
// the CPU supports it.
#[target_feature(enable = "sse4.2")]
4-
pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
4+
pub unsafe fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) {
55
while bytes.as_ref().len() >= 16 {
6-
let advance = match_url_char_16_sse(bytes.as_ref());
6+
let advance = if allow_non_compliant {
7+
match_url_char_non_compliant_16_sse(bytes.as_ref())
8+
} else {
9+
match_url_char_16_sse(bytes.as_ref())
10+
};
11+
712
bytes.advance(advance);
813

914
if advance != 16 {
1015
return;
1116
}
1217
}
13-
super::swar::match_uri_vectored(bytes);
18+
super::swar::match_uri_vectored(bytes, allow_non_compliant);
1419
}
1520

1621
#[inline(always)]
@@ -61,6 +66,33 @@ unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
6166
r.trailing_zeros() as usize
6267
}
6368

69+
// Returns how many bytes at the start of `buf` (0..=16) are accepted in non-compliant
// mode, i.e. fall in 0x21..=0x7e or 0x80..=0xff. Only the first 16 bytes are examined.
//
// Safety: `buf` must hold at least 16 bytes (only checked by `debug_assert!`), and the
// CPU must support the SSE intrinsics used below.
// NOTE(review): the SIMD table tests visible in this diff call the matcher only with
// `false`; confirm this routine is also checked against `URI_NON_COMPLIANT_MAP`.
#[inline(always)]
70+
#[allow(non_snake_case)]
71+
unsafe fn match_url_char_non_compliant_16_sse(buf: &[u8]) -> usize {
72+
debug_assert!(buf.len() >= 16);
73+
74+
#[cfg(target_arch = "x86")]
75+
use core::arch::x86::*;
76+
#[cfg(target_arch = "x86_64")]
77+
use core::arch::x86_64::*;
78+
79+
let ptr = buf.as_ptr();
80+
81+
// %x21-%x7e %x80-%xff
82+
let DEL: __m128i = _mm_set1_epi8(0x7f);
83+
let LOW: __m128i = _mm_set1_epi8(0x21);
84+
85+
// unaligned 16-byte load from the start of `buf`
let dat = _mm_lddqu_si128(ptr as *const _);
86+
// unsigned comparison dat >= LOW
87+
let low = _mm_cmpeq_epi8(_mm_max_epu8(dat, LOW), dat);
88+
// lanes equal to 0x7f (DEL)
let del = _mm_cmpeq_epi8(dat, DEL);
89+
// accepted lanes: `>= 0x21` and not DEL (`andnot` computes `!del & low`)
let bit = _mm_andnot_si128(del, low);
90+
// one mask bit per byte lane, lane 0 in bit 0
let res = _mm_movemask_epi8(bit) as u16;
91+
92+
// TODO: use .trailing_ones() once MSRV >= 1.46
93+
// trailing ones of `res` = count of leading accepted bytes; yields 16 when all match
(!res).trailing_zeros() as usize
94+
}
95+
6496
#[target_feature(enable = "sse4.2")]
6597
pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) {
6698
while bytes.as_ref().len() >= 16 {
@@ -111,11 +143,11 @@ fn sse_code_matches_uri_chars_table() {
111143

112144
#[allow(clippy::undocumented_unsafe_blocks)]
113145
unsafe {
114-
assert!(byte_is_allowed(b'_', match_uri_vectored));
146+
assert!(byte_is_allowed(b'_', |b| match_uri_vectored(b, false)));
115147

116148
for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() {
117149
assert_eq!(
118-
byte_is_allowed(b as u8, match_uri_vectored), allowed,
150+
byte_is_allowed(b as u8, |b| match_uri_vectored(b, false)), allowed,
119151
"byte_is_allowed({:?}) should be {:?}", b, allowed,
120152
);
121153
}

src/simd/swar.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ const BLOCK_SIZE: usize = core::mem::size_of::<usize>();
77
type ByteBlock = [u8; BLOCK_SIZE];
88

99
#[inline]
10-
pub fn match_uri_vectored(bytes: &mut Bytes) {
10+
pub fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) {
1111
loop {
1212
if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) {
1313
let n = match_uri_char_8_swar(bytes8);
@@ -21,7 +21,7 @@ pub fn match_uri_vectored(bytes: &mut Bytes) {
2121
}
2222
}
2323
if let Some(b) = bytes.peek() {
24-
if is_uri_token(b) {
24+
if is_uri_token(b, allow_non_compliant) {
2525
// SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte
2626
// in bytes, so calling advance is safe.
2727
unsafe {

0 commit comments

Comments
 (0)