diff --git a/wincode/benches/benchmarks.rs b/wincode/benches/benchmarks.rs
index 4b38fc65..802a92a9 100644
--- a/wincode/benches/benchmarks.rs
+++ b/wincode/benches/benchmarks.rs
@@ -1,5 +1,6 @@
 use {
     criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput},
+    rand::{Rng as _, SeedableRng},
     serde::{Deserialize, Serialize},
     std::{collections::HashMap, hint::black_box},
     wincode::{
@@ -96,6 +97,26 @@ fn bench_primitives_comparison(c: &mut Criterion) {
     group.finish();
 }
 
+/// Benchmarks decoding 10,000 pseudo-random (fixed-seed) `char`s one at a time from a byte slice.
+fn bench_char_deserialization(c: &mut Criterion) {
+    c.bench_function("char/wincode/deserialize", |b| {
+        let str: String = rand::prelude::SmallRng::seed_from_u64(0x42)
+            .sample_iter::<char, _>(rand::distr::StandardUniform)
+            .take(10_000)
+            .collect();
+
+        b.iter(|| {
+            let mut bytes = black_box(str.as_bytes());
+            let mut sum: u32 = 0;
+            while !bytes.is_empty() {
+                let ch: char = wincode::deserialize_from(&mut bytes).unwrap();
+                sum = sum.wrapping_add(ch as u32);
+            }
+            black_box(sum);
+        });
+    });
+}
+
 fn bench_vec_comparison(c: &mut Criterion) {
     let mut group = c.benchmark_group("Vec");
 
@@ -862,6 +883,7 @@ criterion_group!(
     bench_vec_unit_enum_comparison,
     bench_vec_same_sized_enum_comparison,
     bench_vec_mixed_sized_enum_comparison,
+    bench_char_deserialization,
 );
 
 #[cfg(feature = "solana-short-vec")]
diff --git a/wincode/src/error.rs b/wincode/src/error.rs
index a759e06d..ce5c8bb7 100644
--- a/wincode/src/error.rs
+++ b/wincode/src/error.rs
@@ -34,6 +34,8 @@ pub enum ReadError {
     Io(#[from] io::ReadError),
     #[error(transparent)]
     InvalidUtf8Encoding(#[from] Utf8Error),
+    #[error("Decoded UTF-8 value {0} is not a valid character")]
+    InvalidUtf8Code(u32),
     #[error("Could not cast integer type to pointer sized type")]
     PointerSizedReadError,
     #[error(
diff --git a/wincode/src/schema/impls.rs b/wincode/src/schema/impls.rs
index edda6d2f..f1d8b53e 100644
--- a/wincode/src/schema/impls.rs
+++ b/wincode/src/schema/impls.rs
@@ -349,30 +349,58 @@ unsafe impl<'de, C: ConfigCore> SchemaRead<'de, C> for char {
 
     #[inline]
     fn read(mut reader: impl Reader<'de>, dst: &mut MaybeUninit<Self::Dst>) -> ReadResult<()> {
-        let b0 = *reader.peek()?;
+        use crate::error::ReadError;
 
-        let len = match b0 {
-            0x00..=0x7F => 1,
-            0xC2..=0xDF => 2,
-            0xE0..=0xEF => 3,
-            0xF0..=0xF4 => 4,
+        // We re-validate with from_utf8 only on error path to get proper Utf8Error.
+        #[cold]
+        fn utf8_error(buf: &[u8]) -> ReadError {
+            invalid_utf8_encoding(core::str::from_utf8(buf).unwrap_err())
+        }
+        let b0 = *reader.peek()?;
+        let code_point = match b0 {
+            0x00..=0x7F => {
+                // SAFETY: `peek()` just returned `b0`, so at least one byte is available.
+                unsafe { reader.consume_unchecked(1) };
+                dst.write(b0 as char);
+                return Ok(());
+            }
+            0xC2..=0xDF => {
+                let [b0, b1] = reader.take_array()?;
+                // Validate continuation byte (must be 10xxxxxx)
+                if (b1 & 0xC0) != 0x80 {
+                    return Err(utf8_error(&[b0, b1]));
+                }
+                ((b0 & 0x1F) as u32) << 6 | ((b1 & 0x3F) as u32)
+            }
+            0xE0..=0xEF => {
+                let [b0, b1, b2] = reader.take_array()?;
+                if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 {
+                    return Err(utf8_error(&[b0, b1, b2]));
+                }
+                // Check for overlong encodings (< U+0800) and surrogates (U+D800..U+DFFF)
+                if (b0 == 0xE0 && b1 < 0xA0) || (b0 == 0xED && b1 >= 0xA0) {
+                    return Err(utf8_error(&[b0, b1, b2]));
+                }
+                ((b0 & 0x0F) as u32) << 12 | ((b1 & 0x3F) as u32) << 6 | ((b2 & 0x3F) as u32)
+            }
+            0xF0..=0xF4 => {
+                let [b0, b1, b2, b3] = reader.take_array()?;
+                if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 {
+                    return Err(utf8_error(&[b0, b1, b2, b3]));
+                }
+                // Check for overlong encodings (< U+10000) and code points above U+10FFFF
+                if (b0 == 0xF0 && b1 < 0x90) || (b0 == 0xF4 && b1 > 0x8F) {
+                    return Err(utf8_error(&[b0, b1, b2, b3]));
+                }
+                ((b0 & 0x07) as u32) << 18
+                    | ((b1 & 0x3F) as u32) << 12
+                    | ((b2 & 0x3F) as u32) << 6
+                    | ((b3 & 0x3F) as u32)
+            }
             _ => return Err(invalid_char_lead(b0)),
         };
 
-        if len == 1 {
-            unsafe { reader.consume_unchecked(1) };
-            dst.write(b0 as char);
-            return Ok(());
-        }
-
-        let buf = reader.fill_exact(len)?;
-        // TODO: Could implement a manual decoder that avoids UTF-8 validate + chars()
-        // and instead performs the UTF-8 validity checks and produces a `char` directly.
-        // Some quick micro-benchmarking revealed a roughly 2x speedup is possible,
-        // but this is on the order of a 1-2ns/byte delta.
-        let str = core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?;
-        let c = str.chars().next().unwrap();
-        unsafe { reader.consume_unchecked(len) };
+        let c = char::from_u32(code_point).ok_or(ReadError::InvalidUtf8Code(code_point))?;
         dst.write(c);
         Ok(())
     }