Skip to content

Commit 9f6702b

Browse files
authored
Fix method parsing to reject a leading space (#190)
The `is_token` function, used exclusively for parsing the method in a request line, accepts more bytes than it should: its check, `b > 0x1F && b < 0x7F`, admits every printable ASCII byte, including the space and delimiters such as `(` and `)`. In particular, it allows a method with a leading space to be parsed. This problem is not exposed in hyper, which revalidates any method extracted by httparse; otherwise I'm sure this would have been noticed sooner! Checking for a single range of valid bytes is very fast, so I've taken care to make sure that making `is_token` more complicated doesn't slow down the most common case. While exploring a variety of options, I found the existing benchmark scheme to be a bit misleading because it would test only a single method at a time, so I've made a new benchmark that roughly simulates a mix of requests. Ultimately, what I found to be a reasonable fix without any slowdown for the 99.9999% case is to check `b'A'..=b'Z'` first and then fall back to a "byte map" lookup. Both methods and header names have the same set of allowed bytes, a "token", but their uses are slightly different, so I thought it would make sense to rename `is_token` to `is_method_token`, to mimic `is_header_name_token`.
1 parent 97c7e6e commit 9f6702b

File tree

4 files changed

+71
-14
lines changed

4 files changed

+71
-14
lines changed

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ std = []
1818

1919
[dev-dependencies]
2020
criterion = "0.3.5"
21+
rand = "0.8.5"
2122

2223
[lib]
2324
bench = false

benches/parse.rs

+35-3
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ fn version(c: &mut Criterion) {
181181
}
182182

183183
fn method(c: &mut Criterion) {
184-
fn _method(c: &mut Criterion, name: &str, input: &'static [u8]) {
184+
fn _method(c: &mut Criterion, name: &str, input: &[u8]) {
185185
c.benchmark_group("method")
186186
.throughput(Throughput::Bytes(input.len() as u64))
187187
.bench_function(name, |b| b.iter(|| {
@@ -193,10 +193,42 @@ fn method(c: &mut Criterion) {
193193
// Common methods should be fast-pathed
194194
const COMMON_METHODS: &[&str] = &["GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"];
195195
for method in COMMON_METHODS {
196-
_method(c, &method.to_lowercase(), format!("{} / HTTP/1.1\r\n", method).into_bytes().leak());
196+
_method(c, &method.to_lowercase(), format!("{} / HTTP/1.1\r\n", method).as_bytes());
197197
}
198198
// Custom methods should be infrequent and thus not worth optimizing
199199
_method(c, "custom", b"CUSTOM / HTTP/1.1\r\n");
200+
_method(c, "w3!rd", b"w3!rd / HTTP/1.1\r\n");
201+
}
202+
203+
fn many_requests(c: &mut Criterion) {
204+
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
205+
let mut requests = [
206+
("GET", 500),
207+
("POST", 300),
208+
("OPTIONS", 100),
209+
("HEAD", 50),
210+
("w3!r`d", 20),
211+
]
212+
.iter()
213+
.flat_map(|&(method, count)| std::iter::repeat(method).take(count))
214+
.map(|method| format!("{method} / HTTP/1.1\r\n\r\n"))
215+
.collect::<Vec<_>>();
216+
SliceRandom::shuffle(&mut *requests, &mut StdRng::seed_from_u64(0));
217+
218+
let total_bytes: usize = requests.iter().map(String::len).sum();
219+
220+
c.benchmark_group("many_requests")
221+
.throughput(Throughput::Bytes(total_bytes as u64))
222+
.measurement_time(Duration::from_secs(1))
223+
.sample_size(1000)
224+
.bench_function("_", |b| {
225+
b.iter(|| {
226+
requests.iter().for_each(|req| {
227+
let mut b = httparse::_benchable::Bytes::new(black_box(req.as_bytes()));
228+
httparse::_benchable::parse_method(&mut b).unwrap();
229+
});
230+
})
231+
});
200232
}
201233

202234
const WARMUP: Duration = Duration::from_millis(100);
@@ -205,6 +237,6 @@ const SAMPLES: usize = 200;
205237
criterion_group!{
206238
name = benches;
207239
config = Criterion::default().sample_size(SAMPLES).warm_up_time(WARMUP).measurement_time(MTIME);
208-
targets = req, req_short, resp, resp_short, uri, header, version, method
240+
targets = req, req_short, resp, resp_short, uri, header, version, method, many_requests
209241
}
210242
criterion_main!(benches);

src/lib.rs

+34-10
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ pub mod _benchable {
4444
pub use super::iter::Bytes;
4545
}
4646

47-
/// Determines if byte is a token char.
47+
/// Determines if byte is a method token char.
4848
///
4949
/// > ```notrust
5050
/// > token = 1*tchar
@@ -55,8 +55,12 @@ pub mod _benchable {
5555
/// > ; any VCHAR, except delimiters
5656
/// > ```
5757
#[inline]
58-
fn is_token(b: u8) -> bool {
59-
b > 0x1F && b < 0x7F
58+
fn is_method_token(b: u8) -> bool {
59+
match b {
60+
// For the majority case, this can be faster than the table lookup.
61+
b'A'..=b'Z' => true,
62+
_ => TOKEN_MAP[b as usize],
63+
}
6064
}
6165

6266
// ASCII codes to accept URI string.
@@ -95,7 +99,7 @@ pub(crate) fn is_uri_token(b: u8) -> bool {
9599
URI_MAP[b as usize]
96100
}
97101

98-
static HEADER_NAME_MAP: [bool; 256] = byte_map![
102+
static TOKEN_MAP: [bool; 256] = byte_map![
99103
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100104
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101105
0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
@@ -116,7 +120,7 @@ static HEADER_NAME_MAP: [bool; 256] = byte_map![
116120

117121
#[inline]
118122
pub(crate) fn is_header_name_token(b: u8) -> bool {
119-
HEADER_NAME_MAP[b as usize]
123+
TOKEN_MAP[b as usize]
120124
}
121125

122126
static HEADER_VALUE_MAP: [bool; 256] = byte_map![
@@ -930,7 +934,7 @@ fn parse_reason<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
930934
#[inline]
931935
fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
932936
let b = next!(bytes);
933-
if !is_token(b) {
937+
if !is_method_token(b) {
934938
// First char must be a token char, it can't be a space which would indicate an empty token.
935939
return Err(Error::Token);
936940
}
@@ -939,10 +943,10 @@ fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
939943
let b = next!(bytes);
940944
if b == b' ' {
941945
return Ok(Status::Complete(
942-
// SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
946+
// SAFETY: all bytes up till `i` must have been `is_method_token` and therefore also utf-8.
943947
unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
944948
));
945-
} else if !is_token(b) {
949+
} else if !is_method_token(b) {
946950
return Err(Error::Token);
947951
}
948952
}
@@ -964,7 +968,7 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
964968
}
965969

966970
return Ok(Status::Complete(
967-
// SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
971+
// SAFETY: all bytes up till `i` must have been `is_uri_token` and therefore also utf-8.
968972
unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
969973
));
970974
} else {
@@ -1383,7 +1387,7 @@ pub fn parse_chunk_size(buf: &[u8])
13831387

13841388
#[cfg(test)]
13851389
mod tests {
1386-
use super::{Request, Response, Status, EMPTY_HEADER, parse_chunk_size};
1390+
use super::{Error, Request, Response, Status, EMPTY_HEADER, parse_chunk_size};
13871391

13881392
const NUM_OF_HEADERS: usize = 4;
13891393

@@ -2676,4 +2680,24 @@ mod tests {
26762680
assert_eq!(response.headers[0].name, "foo");
26772681
assert_eq!(response.headers[0].value, &b"bar"[..]);
26782682
}
2683+
2684+
#[test]
2685+
fn test_request_with_leading_space() {
2686+
let mut headers = [EMPTY_HEADER; 1];
2687+
let mut request = Request::new(&mut headers[..]);
2688+
let result = crate::ParserConfig::default()
2689+
.parse_request(&mut request, b" GET / HTTP/1.1\r\nfoo:bar\r\n\r\n");
2690+
2691+
assert_eq!(result, Err(Error::Token));
2692+
}
2693+
2694+
#[test]
2695+
fn test_request_with_invalid_method() {
2696+
let mut headers = [EMPTY_HEADER; 1];
2697+
let mut request = Request::new(&mut headers[..]);
2698+
let result = crate::ParserConfig::default()
2699+
.parse_request(&mut request, b"P()ST / HTTP/1.1\r\nfoo:bar\r\n\r\n");
2700+
2701+
assert_eq!(result, Err(Error::Token));
2702+
}
26792703
}

src/simd/neon.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ fn neon_code_matches_header_name_chars_table() {
235235
unsafe {
236236
assert!(byte_is_allowed(b'_', match_header_name_vectored));
237237

238-
for (b, allowed) in crate::HEADER_NAME_MAP.iter().cloned().enumerate() {
238+
for (b, allowed) in crate::TOKEN_MAP.iter().cloned().enumerate() {
239239
assert_eq!(
240240
byte_is_allowed(b as u8, match_header_name_vectored),
241241
allowed,

0 commit comments

Comments
 (0)