Skip to content

Commit

Permalink
feat(path): allow utf8 chars in path (#178)
Browse files Browse the repository at this point in the history
Closes #146
  • Loading branch information
joelwurtz authored Dec 27, 2024
1 parent 380f130 commit f1cbffc
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 97 deletions.
184 changes: 159 additions & 25 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,35 +63,26 @@ fn is_method_token(b: u8) -> bool {
}
}

// ASCII codes to accept URI string.
// i.e. A-Z a-z 0-9 !#$%&'*+-._();:@=,/?[]~^
// Byte values accepted in a URI string.
// i.e. b'!' <= byte and byte != 127 (DEL); this includes every byte >= 0x80
// TODO: Make a stricter checking for URI string?
static URI_MAP: [bool; 256] = byte_map![
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// \0 \n
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// commands
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// \w ! " # $ % & ' ( ) * + , - . /
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// @ A B C D E F G H I J K L M N O
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// P Q R S T U V W X Y Z [ \ ] ^ _
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// ` a b c d e f g h i j k l m n o
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
// p q r s t u v w x y z { | } ~ del
// ====== Extended ASCII (aka. obs-text) ======
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
];

#[inline]
Expand Down Expand Up @@ -967,10 +958,11 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
return Err(Error::Token);
}

Ok(Status::Complete(
// SAFETY: all bytes up till `i` must have been `is_method_token` and therefore also utf-8.
unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
))
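// Bytes >= 0x80 are accepted in the URI, so the slice is not guaranteed to be
// valid UTF-8 and is validated with `str::from_utf8` below.
// SAFETY: NOTE(review) - the `unsafe` only covers `slice_skip(1)`; presumably it
// requires that at least one byte has already been consumed - confirm against
// `Bytes::slice_skip`.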
match str::from_utf8(unsafe { bytes.slice_skip(1) }) {
Ok(uri) => Ok(Status::Complete(uri)),
Err(_) => Err(Error::Token),
}
} else {
Err(Error::Token)
}
Expand Down Expand Up @@ -2165,7 +2157,7 @@ mod tests {
assert_eq!(result, Err(crate::Error::Token));
}

static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET /foo>ohno HTTP/1.1\r\n\r\n";
static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET /foo ohno HTTP/1.1\r\n\r\n";

#[test]
fn test_request_with_multiple_spaces_and_bad_path() {
Expand All @@ -2174,9 +2166,125 @@ mod tests {
let result = crate::ParserConfig::default()
.allow_multiple_spaces_in_request_line_delimiters(true)
.parse_request(&mut request, REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH);
assert_eq!(result, Err(crate::Error::Version));
}

// This test ensures that an error is returned when the path contains a DEL
// character. Every byte from 0x21 upward is allowed in the path except DEL
// (0x7F), so DEL must be rejected.
static REQUEST_WITH_DEL_IN_PATH: &[u8] = b"GET /foo\x7Fohno HTTP/1.1\r\n\r\n";

/// A request whose path contains a DEL byte (0x7F) must be rejected with
/// `Error::Token`, even when multiple spaces between request-line parts are allowed.
#[test]
fn test_request_with_del_in_path() {
    let mut header_storage = [EMPTY_HEADER; NUM_OF_HEADERS];
    let mut parsed = Request::new(&mut header_storage[..]);

    let outcome = crate::ParserConfig::default()
        .allow_multiple_spaces_in_request_line_delimiters(true)
        .parse_request(&mut parsed, crate::tests::REQUEST_WITH_DEL_IN_PATH);

    assert_eq!(Err(crate::Error::Token), outcome);
}

#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow for this test
fn test_all_utf8_char_in_paths() {
// two code points
for i in 128..256 {
for j in 128..256 {
let mut headers = [EMPTY_HEADER; NUM_OF_HEADERS];
let mut request = Request::new(&mut headers[..]);
let bytes = [i as u8, j as u8];

match core::str::from_utf8(&bytes) {
Ok(s) => {
let first_line = format!("GET /{} HTTP/1.1\r\n\r\n", s);
let result = crate::ParserConfig::default()
.allow_multiple_spaces_in_request_line_delimiters(true)
.parse_request(&mut request, first_line.as_bytes());

assert_eq!(result, Ok(Status::Complete(20)), "failed for utf8 char i: {}, j: {}", i, j);
},
Err(_) => {
let mut first_line = b"GET /".to_vec();
first_line.extend(&bytes);
first_line.extend(b" HTTP/1.1\r\n\r\n");

let result = crate::ParserConfig::default()
.allow_multiple_spaces_in_request_line_delimiters(true)
.parse_request(&mut request, first_line.as_slice());

assert_eq!(result, Err(crate::Error::Token), "failed for utf8 char i: {}, j: {}", i, j);
},
};

// three code points starting from 0xe0
if i < 0xe0 {
continue;
}

for k in 128..256 {
let mut headers = [EMPTY_HEADER; NUM_OF_HEADERS];
let mut request = Request::new(&mut headers[..]);
let bytes = [i as u8, j as u8, k as u8];

match core::str::from_utf8(&bytes) {
Ok(s) => {
let first_line = format!("GET /{} HTTP/1.1\r\n\r\n", s);
let result = crate::ParserConfig::default()
.allow_multiple_spaces_in_request_line_delimiters(true)
.parse_request(&mut request, first_line.as_bytes());

assert_eq!(result, Ok(Status::Complete(21)), "failed for utf8 char i: {}, j: {}, k: {}", i, j, k);
},
Err(_) => {
let mut first_line = b"GET /".to_vec();
first_line.extend(&bytes);
first_line.extend(b" HTTP/1.1\r\n\r\n");

let result = crate::ParserConfig::default()
.allow_multiple_spaces_in_request_line_delimiters(true)
.parse_request(&mut request, first_line.as_slice());

assert_eq!(result, Err(crate::Error::Token), "failed for utf8 char i: {}, j: {}, k: {}", i, j, k);
},
};

// four code points starting from 0xf0
if i < 0xf0 {
continue;
}

for l in 128..256 {
let mut headers = [EMPTY_HEADER; NUM_OF_HEADERS];
let mut request = Request::new(&mut headers[..]);
let bytes = [i as u8, j as u8, k as u8, l as u8];

match core::str::from_utf8(&bytes) {
Ok(s) => {
let first_line = format!("GET /{} HTTP/1.1\r\n\r\n", s);
let result = crate::ParserConfig::default()
.allow_multiple_spaces_in_request_line_delimiters(true)
.parse_request(&mut request, first_line.as_bytes());

assert_eq!(result, Ok(Status::Complete(22)), "failed for utf8 char i: {}, j: {}, k: {}, l: {}", i, j, k, l);
},
Err(_) => {
let mut first_line = b"GET /".to_vec();
first_line.extend(&bytes);
first_line.extend(b" HTTP/1.1\r\n\r\n");

let result = crate::ParserConfig::default()
.allow_multiple_spaces_in_request_line_delimiters(true)
.parse_request(&mut request, first_line.as_slice());

assert_eq!(result, Err(crate::Error::Token), "failed for utf8 char i: {}, j: {}, k: {}, l: {}", i, j, k, l);
},
};
}
}
}
}
}

static RESPONSE_WITH_SPACES_IN_CODE: &[u8] = b"HTTP/1.1 99 200 OK\r\n\r\n";

#[test]
Expand Down Expand Up @@ -2700,4 +2808,30 @@ mod tests {

assert_eq!(result, Err(Error::Token));
}

/// A well-formed multi-byte UTF-8 sequence (`E2 80 99`) inside the request target
/// is accepted: the parse completes with `Status::Complete(67)` (67 is the length
/// of the input) and the path is exposed as a `&str` containing the decoded character.
#[test]
fn test_utf8_in_path_ok() {
    let raw: &[u8] = b"GET /test?post=I\xE2\x80\x99msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n";

    let mut header_slots = [EMPTY_HEADER; 1];
    let mut req = Request::new(&mut header_slots[..]);

    let outcome = crate::ParserConfig::default().parse_request(&mut req, raw);
    assert_eq!(outcome, Ok(Status::Complete(67)));

    // Request line
    assert_eq!(req.version, Some(1));
    assert_eq!(req.method, Some("GET"));
    assert_eq!(req.path, Some("/test?post=I’msorryIforkedyou"));

    // Headers
    assert_eq!(req.headers.len(), 1);
    let host = &req.headers[0];
    assert_eq!(host.name, "Host");
    assert_eq!(host.value, &b"example.org"[..]);
}

/// A lone `0xE2` byte followed by ASCII in the request target is not valid UTF-8,
/// so parsing must fail with `Error::Token`.
#[test]
fn test_bad_utf8_in_path() {
    let raw: &[u8] = b"GET /test?post=I\xE2msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n";

    let mut header_slots = [EMPTY_HEADER; 1];
    let mut req = Request::new(&mut header_slots[..]);

    let outcome = crate::ParserConfig::default().parse_request(&mut req, raw);

    assert_eq!(Err(crate::Error::Token), outcome);
}
}
38 changes: 13 additions & 25 deletions src/simd/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ use crate::iter::Bytes;
#[target_feature(enable = "avx2")]
pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
while bytes.as_ref().len() >= 32 {

let advance = match_url_char_32_avx(bytes.as_ref());

bytes.advance(advance);

if advance != 32 {
Expand All @@ -28,32 +30,18 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {

let ptr = buf.as_ptr();

let LSH: __m256i = _mm256_set1_epi8(0x0f);

// See comment in sse42::match_url_char_16_sse.

let URI: __m256i = _mm256_setr_epi8(
0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c,
0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c,
);
let ARF: __m256i = _mm256_setr_epi8(
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
);

let data = _mm256_lddqu_si256(ptr as *const _);
let rbms = _mm256_shuffle_epi8(URI, data);
let cols = _mm256_and_si256(LSH, _mm256_srli_epi16(data, 4));
let bits = _mm256_and_si256(_mm256_shuffle_epi8(ARF, cols), rbms);

let v = _mm256_cmpeq_epi8(bits, _mm256_setzero_si256());
let r = _mm256_movemask_epi8(v) as u32;
// %x21-%x7e %x80-%xff
let DEL: __m256i = _mm256_set1_epi8(0x7f);
let LOW: __m256i = _mm256_set1_epi8(0x21);

r.trailing_zeros() as usize
let dat = _mm256_lddqu_si256(ptr as *const _);
// unsigned comparison dat >= LOW
let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat);
let del = _mm256_cmpeq_epi8(dat, DEL);
let bit = _mm256_andnot_si256(del, low);
let res = _mm256_movemask_epi8(bit) as u32;
// TODO: use .trailing_ones() once MSRV >= 1.46
(!res).trailing_zeros() as usize
}

#[target_feature(enable = "avx2")]
Expand Down
17 changes: 6 additions & 11 deletions src/simd/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,17 +125,12 @@ unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize {
unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize {
let input = vld1q_u8(ptr);

// Check that b'!' <= input <= b'~'
let result = vandq_u8(
vcleq_u8(vdupq_n_u8(b'!'), input),
vcleq_u8(input, vdupq_n_u8(b'~')),
);
// Check that input != b'<' and input != b'>'
let lt = vceqq_u8(input, vdupq_n_u8(b'<'));
let gt = vceqq_u8(input, vdupq_n_u8(b'>'));
let ltgt = vorrq_u8(lt, gt);
// Nand with result
let result = vbicq_u8(result, ltgt);
// Check that b'!' <= input; DEL (0x7F) is excluded separately below
let result = vcleq_u8(vdupq_n_u8(b'!'), input);

// Disallow del
let del = vceqq_u8(input, vdupq_n_u8(0x7F));
let result = vbicq_u8(result, del);

offsetz(result) as usize
}
Expand Down
50 changes: 15 additions & 35 deletions src/simd/sse42.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use crate::iter::Bytes;
pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
while bytes.as_ref().len() >= 16 {
let advance = match_url_char_16_sse(bytes.as_ref());

bytes.advance(advance);

if advance != 16 {
Expand All @@ -14,7 +15,7 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
}

#[inline(always)]
#[allow(non_snake_case, overflowing_literals)]
#[allow(non_snake_case)]
unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
debug_assert!(buf.len() >= 16);

Expand All @@ -25,40 +26,19 @@ unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {

let ptr = buf.as_ptr();

let LSH: __m128i = _mm_set1_epi8(0x0f);

// The first 0xf8 corresponds to the 8 first rows of the first column
// of URI_MAP in the crate's root, with the first row corresponding to bit 0
// and the 8th row corresponding to bit 7.
// The 8 first rows give 0 0 0 1 1 1 1 1, which is 0xf8 (with least
// significant digit on the left).
//
// Another example just to drive the point home: in column 15, '>' is
// rejected, so the values are 0 0 1 0 1 1 1 1, which gives us 0xf4.
//
// Thanks to Vlad Krasnov for explaining this stuff to us mere mortals in
// a GitHub comment!
//
// https://github.com/seanmonstar/httparse/pull/89#issuecomment-807039219

let URI: __m128i = _mm_setr_epi8(
0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c,
);
let ARF: __m128i = _mm_setr_epi8(
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
);

let data = _mm_lddqu_si128(ptr as *const _);
let rbms = _mm_shuffle_epi8(URI, data);
let cols = _mm_and_si128(LSH, _mm_srli_epi16(data, 4));
let bits = _mm_and_si128(_mm_shuffle_epi8(ARF, cols), rbms);

let v = _mm_cmpeq_epi8(bits, _mm_setzero_si128());
let r = _mm_movemask_epi8(v) as u16;

r.trailing_zeros() as usize
// %x21-%x7e %x80-%xff
let DEL: __m128i = _mm_set1_epi8(0x7f);
let LOW: __m128i = _mm_set1_epi8(0x21);

let dat = _mm_lddqu_si128(ptr as *const _);
// unsigned comparison dat >= LOW
let low = _mm_cmpeq_epi8(_mm_max_epu8(dat, LOW), dat);
let del = _mm_cmpeq_epi8(dat, DEL);
let bit = _mm_andnot_si128(del, low);
let res = _mm_movemask_epi8(bit) as u16;

// TODO: use .trailing_ones() once MSRV >= 1.46
(!res).trailing_zeros() as usize
}

#[target_feature(enable = "sse4.2")]
Expand Down
2 changes: 1 addition & 1 deletion src/simd/swar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize {
// A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44)
// creates a u64 whose bytes are each equal to b
const fn uniform_block(b: u8) -> usize {
(b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize
(b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize
}

// A byte-wise range-check on an entire word/block,
Expand Down

0 comments on commit f1cbffc

Please sign in to comment.