Skip to content

Commit 47380b3

Browse files
committed
feat(utf8): use a feature instead, add a specific error if utf8 is invalid
1 parent 6cb1b83 commit 47380b3

File tree

6 files changed

+65
-47
lines changed

6 files changed

+65
-47
lines changed

Cargo.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ edition = "2018"
1313
build = "build.rs"
1414

1515
[features]
16-
default = ["std"]
16+
default = ["std", "utf8_in_path"]
17+
utf8_in_path = []
1718
std = []
1819

1920
[dev-dependencies]

src/lib.rs

+40-19
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ fn is_token(b: u8) -> bool {
6262
// ASCII codes to accept URI string.
6363
// i.e. A-Z a-z 0-9 !#$%&'*+-._();:@=,/?[]~^
6464
// TODO: Make a stricter checking for URI string?
65+
#[cfg(not(feature = "utf8_in_path"))]
6566
static URI_MAP: [bool; 256] = byte_map![
6667
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6768
// \0 \n
@@ -90,7 +91,8 @@ static URI_MAP: [bool; 256] = byte_map![
9091
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9192
];
9293

93-
static URI_NON_COMPLIANT_MAP: [bool; 256] = byte_map![
94+
#[cfg(feature = "utf8_in_path")]
95+
static URI_MAP: [bool; 256] = byte_map![
9496
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9597
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9698
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -110,12 +112,8 @@ static URI_NON_COMPLIANT_MAP: [bool; 256] = byte_map![
110112
];
111113

112114
#[inline]
113-
pub(crate) fn is_uri_token(b: u8, allow_non_compliant: bool) -> bool {
114-
if allow_non_compliant {
115-
URI_NON_COMPLIANT_MAP[b as usize]
116-
} else {
117-
URI_MAP[b as usize]
118-
}
115+
pub(crate) fn is_uri_token(b: u8) -> bool {
116+
URI_MAP[b as usize]
119117
}
120118

121119
static HEADER_NAME_MAP: [bool; 256] = byte_map![
@@ -184,6 +182,9 @@ pub enum Error {
184182
TooManyHeaders,
185183
/// Invalid byte in HTTP version.
186184
Version,
185+
#[cfg(feature = "utf8_in_path")]
186+
/// Invalid UTF-8 in path.
187+
Utf8Error,
187188
}
188189

189190
impl Error {
@@ -197,6 +198,8 @@ impl Error {
197198
Error::Token => "invalid token",
198199
Error::TooManyHeaders => "too many headers",
199200
Error::Version => "invalid HTTP version",
201+
#[cfg(feature = "utf8_in_path")]
202+
Error::Utf8Error => "invalid UTF-8 in path",
200203
}
201204
}
202205
}
@@ -283,7 +286,6 @@ pub struct ParserConfig {
283286
allow_multiple_spaces_in_request_line_delimiters: bool,
284287
allow_multiple_spaces_in_response_status_delimiters: bool,
285288
allow_space_before_first_header_name: bool,
286-
allow_rfc3986_non_compliant_path: bool,
287289
ignore_invalid_headers_in_responses: bool,
288290
ignore_invalid_headers_in_requests: bool,
289291
}
@@ -563,7 +565,7 @@ impl<'h, 'b> Request<'h, 'b> {
563565
if config.allow_multiple_spaces_in_request_line_delimiters {
564566
complete!(skip_spaces(&mut bytes));
565567
}
566-
self.path = Some(complete!(parse_uri(&mut bytes, config.allow_rfc3986_non_compliant_path)));
568+
self.path = Some(complete!(parse_uri(&mut bytes)));
567569
if config.allow_multiple_spaces_in_request_line_delimiters {
568570
complete!(skip_spaces(&mut bytes));
569571
}
@@ -976,9 +978,9 @@ fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
976978
#[doc(hidden)]
977979
#[allow(missing_docs)]
978980
// WARNING: Exported for internal benchmarks, not fit for public consumption
979-
pub fn parse_uri<'a>(bytes: &mut Bytes<'a>, allow_non_compliant: bool) -> Result<&'a str> {
981+
pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
980982
let start = bytes.pos();
981-
simd::match_uri_vectored(bytes, allow_non_compliant);
983+
simd::match_uri_vectored(bytes);
982984
let end = bytes.pos();
983985

984986
if next!(bytes) == b' ' {
@@ -987,6 +989,14 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>, allow_non_compliant: bool) -> Result
987989
return Err(Error::Token);
988990
}
989991

992+
#[cfg(feature = "utf8_in_path")]
993+
// SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
994+
return match str::from_utf8(unsafe { bytes.slice_skip(1) }) {
995+
Ok(uri) => Ok(Status::Complete(uri)),
996+
Err(_) => Err(Error::Utf8Error),
997+
};
998+
999+
#[cfg(not(feature = "utf8_in_path"))]
9901000
return Ok(Status::Complete(
9911001
// SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
9921002
unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
@@ -2077,7 +2087,7 @@ mod tests {
20772087
assert_eq!(parse_chunk_size(b"567f8a\rfoo"), Err(crate::InvalidChunkSize));
20782088
assert_eq!(parse_chunk_size(b"567f8a\rfoo"), Err(crate::InvalidChunkSize));
20792089
assert_eq!(parse_chunk_size(b"567xf8a\r\n"), Err(crate::InvalidChunkSize));
2080-
assert_eq!(parse_chunk_size(b"ffffffffffffffff\r\n"), Ok(Status::Complete((18, std::u64::MAX))));
2090+
assert_eq!(parse_chunk_size(b"ffffffffffffffff\r\n"), Ok(Status::Complete((18, u64::MAX))));
20812091
assert_eq!(parse_chunk_size(b"1ffffffffffffffff\r\n"), Err(crate::InvalidChunkSize));
20822092
assert_eq!(parse_chunk_size(b"Affffffffffffffff\r\n"), Err(crate::InvalidChunkSize));
20832093
assert_eq!(parse_chunk_size(b"fffffffffffffffff\r\n"), Err(crate::InvalidChunkSize));
@@ -2185,7 +2195,7 @@ mod tests {
21852195
assert_eq!(result, Err(crate::Error::Token));
21862196
}
21872197

2188-
static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET /foo>ohno HTTP/1.1\r\n\r\n";
2198+
static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET /foo ohno HTTP/1.1\r\n\r\n";
21892199

21902200
#[test]
21912201
fn test_request_with_multiple_spaces_and_bad_path() {
@@ -2194,7 +2204,7 @@ mod tests {
21942204
let result = crate::ParserConfig::default()
21952205
.allow_multiple_spaces_in_request_line_delimiters(true)
21962206
.parse_request(&mut request, REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH);
2197-
assert_eq!(result, Err(crate::Error::Token));
2207+
assert_eq!(result, Err(crate::Error::Version));
21982208
}
21992209

22002210
static RESPONSE_WITH_SPACES_IN_CODE: &[u8] = b"HTTP/1.1 99 200 OK\r\n\r\n";
@@ -2702,7 +2712,8 @@ mod tests {
27022712
}
27032713

27042714
#[test]
2705-
fn test_rfc3986_non_compliant_path_ko() {
2715+
#[cfg(not(feature = "utf8_in_path"))]
2716+
fn test_utf8_in_path_ko() {
27062717
let mut headers = [EMPTY_HEADER; 1];
27072718
let mut request = Request::new(&mut headers[..]);
27082719

@@ -2712,13 +2723,12 @@ mod tests {
27122723
}
27132724

27142725
#[test]
2715-
fn test_rfc3986_non_compliant_path_ok() {
2726+
#[cfg(feature = "utf8_in_path")]
2727+
fn test_utf8_in_path_ok() {
27162728
let mut headers = [EMPTY_HEADER; 1];
27172729
let mut request = Request::new(&mut headers[..]);
2718-
let mut config = crate::ParserConfig::default();
2719-
config.allow_rfc3986_non_compliant_path = true;
27202730

2721-
let result = config.parse_request(&mut request, b"GET /test?post=I\xE2\x80\x99msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n");
2731+
let result = crate::ParserConfig::default().parse_request(&mut request, b"GET /test?post=I\xE2\x80\x99msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n");
27222732

27232733
assert_eq!(result, Ok(Status::Complete(67)));
27242734
assert_eq!(request.version.unwrap(), 1);
@@ -2728,4 +2738,15 @@ mod tests {
27282738
assert_eq!(request.headers[0].name, "Host");
27292739
assert_eq!(request.headers[0].value, &b"example.org"[..]);
27302740
}
2741+
2742+
#[test]
2743+
#[cfg(feature = "utf8_in_path")]
2744+
fn test_bad_utf8_in_path() {
2745+
let mut headers = [EMPTY_HEADER; 1];
2746+
let mut request = Request::new(&mut headers[..]);
2747+
2748+
let result = crate::ParserConfig::default().parse_request(&mut request, b"GET /test?post=I\xE2msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n");
2749+
2750+
assert_eq!(result, Err(crate::Error::Utf8Error));
2751+
}
27312752
}

src/simd/avx2.rs

+8-10
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,10 @@ use crate::iter::Bytes;
22

33
#[inline]
44
#[target_feature(enable = "avx2", enable = "sse4.2")]
5-
pub unsafe fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) {
5+
pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
66
while bytes.as_ref().len() >= 32 {
77

8-
let advance = if allow_non_compliant {
9-
match_url_char_non_compliant_32_avx(bytes.as_ref())
10-
} else {
11-
match_url_char_32_avx(bytes.as_ref())
12-
};
8+
let advance = match_url_char_32_avx(bytes.as_ref());
139

1410
bytes.advance(advance);
1511

@@ -18,9 +14,10 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) {
1814
}
1915
}
2016
// do both, since avx2 only works when bytes.len() >= 32
21-
super::sse42::match_uri_vectored(bytes, allow_non_compliant)
17+
super::sse42::match_uri_vectored(bytes)
2218
}
2319

20+
#[cfg(not(feature = "utf8_in_path"))]
2421
#[inline(always)]
2522
#[allow(non_snake_case, overflowing_literals)]
2623
#[allow(unused)]
@@ -62,10 +59,11 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
6259
r.trailing_zeros() as usize
6360
}
6461

62+
#[cfg(feature = "utf8_in_path")]
6563
#[inline(always)]
6664
#[allow(non_snake_case, overflowing_literals)]
6765
#[allow(unused)]
68-
unsafe fn match_url_char_non_compliant_32_avx(buf: &[u8]) -> usize {
66+
unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
6967
debug_assert!(buf.len() >= 32);
7068

7169
#[cfg(target_arch = "x86")]
@@ -140,11 +138,11 @@ fn avx2_code_matches_uri_chars_table() {
140138

141139
#[allow(clippy::undocumented_unsafe_blocks)]
142140
unsafe {
143-
assert!(byte_is_allowed(b'_', |b| match_uri_vectored(b, false)));
141+
assert!(byte_is_allowed(b'_', match_uri_vectored));
144142

145143
for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() {
146144
assert_eq!(
147-
byte_is_allowed(b as u8, |b| match_uri_vectored(b, false)), allowed,
145+
byte_is_allowed(b as u8, match_uri_vectored), allowed,
148146
"byte_is_allowed({:?}) should be {:?}", b, allowed,
149147
);
150148
}

src/simd/runtime.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,13 @@ pub fn match_header_name_vectored(bytes: &mut Bytes) {
3434
super::swar::match_header_name_vectored(bytes);
3535
}
3636

37-
pub fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) {
37+
pub fn match_uri_vectored(bytes: &mut Bytes) {
3838
// SAFETY: calls are guarded by a feature check
3939
unsafe {
4040
match get_runtime_feature() {
41-
AVX2 => avx2::match_uri_vectored(bytes, allow_non_compliant),
42-
SSE42 => sse42::match_uri_vectored(bytes, allow_non_compliant),
43-
_ /* NOP */ => super::swar::match_uri_vectored(bytes, allow_non_compliant),
41+
AVX2 => avx2::match_uri_vectored(bytes),
42+
SSE42 => sse42::match_uri_vectored(bytes),
43+
_ /* NOP */ => super::swar::match_uri_vectored(bytes),
4444
}
4545
}
4646
}

src/simd/sse42.rs

+8-10
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,20 @@
11
use crate::iter::Bytes;
22

33
#[target_feature(enable = "sse4.2")]
4-
pub unsafe fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) {
4+
pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
55
while bytes.as_ref().len() >= 16 {
6-
let advance = if allow_non_compliant {
7-
match_url_char_non_compliant_16_sse(bytes.as_ref())
8-
} else {
9-
match_url_char_16_sse(bytes.as_ref())
10-
};
6+
let advance = match_url_char_16_sse(bytes.as_ref());
117

128
bytes.advance(advance);
139

1410
if advance != 16 {
1511
return;
1612
}
1713
}
18-
super::swar::match_uri_vectored(bytes, allow_non_compliant);
14+
super::swar::match_uri_vectored(bytes);
1915
}
2016

17+
#[cfg(not(feature = "utf8_in_path"))]
2118
#[inline(always)]
2219
#[allow(non_snake_case, overflowing_literals)]
2320
unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
@@ -66,9 +63,10 @@ unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
6663
r.trailing_zeros() as usize
6764
}
6865

66+
#[cfg(feature = "utf8_in_path")]
6967
#[inline(always)]
7068
#[allow(non_snake_case)]
71-
unsafe fn match_url_char_non_compliant_16_sse(buf: &[u8]) -> usize {
69+
unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
7270
debug_assert!(buf.len() >= 16);
7371

7472
#[cfg(target_arch = "x86")]
@@ -143,11 +141,11 @@ fn sse_code_matches_uri_chars_table() {
143141

144142
#[allow(clippy::undocumented_unsafe_blocks)]
145143
unsafe {
146-
assert!(byte_is_allowed(b'_', |b| match_uri_vectored(b, false)));
144+
assert!(byte_is_allowed(b'_', match_uri_vectored));
147145

148146
for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() {
149147
assert_eq!(
150-
byte_is_allowed(b as u8, |b| match_uri_vectored(b, false)), allowed,
148+
byte_is_allowed(b as u8, match_uri_vectored), allowed,
151149
"byte_is_allowed({:?}) should be {:?}", b, allowed,
152150
);
153151
}

src/simd/swar.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ const BLOCK_SIZE: usize = core::mem::size_of::<usize>();
77
type ByteBlock = [u8; BLOCK_SIZE];
88

99
#[inline]
10-
pub fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) {
10+
pub fn match_uri_vectored(bytes: &mut Bytes) {
1111
loop {
1212
if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) {
1313
let n = match_uri_char_8_swar(bytes8);
@@ -21,7 +21,7 @@ pub fn match_uri_vectored(bytes: &mut Bytes, allow_non_compliant: bool) {
2121
}
2222
}
2323
if let Some(b) = bytes.peek() {
24-
if is_uri_token(b, allow_non_compliant) {
24+
if is_uri_token(b) {
2525
// SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte
2626
// in bytes, so calling advance is safe.
2727
unsafe {
@@ -106,7 +106,7 @@ fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize {
106106
// A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44)
107107
// creates a u64 whose bytes are each equal to b
108108
const fn uniform_block(b: u8) -> usize {
109-
(b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize
109+
(b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize
110110
}
111111

112112
// A byte-wise range-check on an enire word/block,

0 commit comments

Comments
 (0)