From 5286b9d43e016694927beb65ebdf6844131a15f0 Mon Sep 17 00:00:00 2001 From: tanmay4l Date: Mon, 17 Nov 2025 15:49:02 +0530 Subject: [PATCH 1/4] Optimize char deserialization with manual UTF-8 decoder --- wincode/src/schema/impls.rs | 60 +++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/wincode/src/schema/impls.rs b/wincode/src/schema/impls.rs index edda6d2f..9eb34a4f 100644 --- a/wincode/src/schema/impls.rs +++ b/wincode/src/schema/impls.rs @@ -366,12 +366,60 @@ unsafe impl<'de, C: ConfigCore> SchemaRead<'de, C> for char { } let buf = reader.fill_exact(len)?; - // TODO: Could implement a manual decoder that avoids UTF-8 validate + chars() - // and instead performs the UTF-8 validity checks and produces a `char` directly. - // Some quick micro-benchmarking revealed a roughly 2x speedup is possible, - // but this is on the order of a 1-2ns/byte delta. - let str = core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - let c = str.chars().next().unwrap(); + + // Manual UTF-8 decoder for 2x speedup by avoiding intermediate str allocation + let code_point = match len { + 2 => { + let b1 = buf[1]; + // Validate continuation byte (must be 10xxxxxx) + if (b1 & 0xC0) != 0x80 { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + ((b0 & 0x1F) as u32) << 6 | ((b1 & 0x3F) as u32) + } + 3 => { + let b1 = buf[1]; + let b2 = buf[2]; + if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + // Check for overlong encodings (< U+0800) and surrogates (U+D800..U+DFFF) + if (b0 == 0xE0 && b1 < 0xA0) || (b0 == 0xED && b1 >= 0xA0) { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + ((b0 & 0x0F) as u32) << 12 | ((b1 & 0x3F) as u32) << 6 | ((b2 & 0x3F) as u32) + } + 4 => { + let b1 = buf[1]; + let b2 = buf[2]; + let b3 = buf[3]; + if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 
0x80 { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + if (b0 == 0xF0 && b1 < 0x90) || (b0 == 0xF4 && b1 > 0x8F) { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + ((b0 & 0x07) as u32) << 18 + | ((b1 & 0x3F) as u32) << 12 + | ((b2 & 0x3F) as u32) << 6 + | ((b3 & 0x3F) as u32) + } + _ => unreachable!(), + }; + + let c = match char::from_u32(code_point) { + Some(c) => c, + None => { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + }; + unsafe { reader.consume_unchecked(len) }; dst.write(c); Ok(()) From ac9807bee97c10c370ea369723987195d109fe19 Mon Sep 17 00:00:00 2001 From: tanmay4l Date: Mon, 17 Nov 2025 21:05:03 +0530 Subject: [PATCH 2/4] Clippy-clean --- wincode/src/schema/impls.rs | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/wincode/src/schema/impls.rs b/wincode/src/schema/impls.rs index 9eb34a4f..f5f53b8b 100644 --- a/wincode/src/schema/impls.rs +++ b/wincode/src/schema/impls.rs @@ -367,14 +367,20 @@ unsafe impl<'de, C: ConfigCore> SchemaRead<'de, C> for char { let buf = reader.fill_exact(len)?; + // We re-validate with from_utf8 only on error path to get proper Utf8Error. 
+ #[inline] + #[cold] + fn utf8_error(buf: &[u8]) -> crate::error::ReadError { + invalid_utf8_encoding(core::str::from_utf8(buf).unwrap_err()) + } + // Manual UTF-8 decoder for 2x speedup by avoiding intermediate str allocation let code_point = match len { 2 => { let b1 = buf[1]; // Validate continuation byte (must be 10xxxxxx) if (b1 & 0xC0) != 0x80 { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); + return Err(utf8_error(buf)); } ((b0 & 0x1F) as u32) << 6 | ((b1 & 0x3F) as u32) } @@ -382,13 +388,11 @@ unsafe impl<'de, C: ConfigCore> SchemaRead<'de, C> for char { let b1 = buf[1]; let b2 = buf[2]; if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); + return Err(utf8_error(buf)); } // Check for overlong encodings (< U+0800) and surrogates (U+D800..U+DFFF) if (b0 == 0xE0 && b1 < 0xA0) || (b0 == 0xED && b1 >= 0xA0) { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); + return Err(utf8_error(buf)); } ((b0 & 0x0F) as u32) << 12 | ((b1 & 0x3F) as u32) << 6 | ((b2 & 0x3F) as u32) } @@ -397,12 +401,10 @@ unsafe impl<'de, C: ConfigCore> SchemaRead<'de, C> for char { let b2 = buf[2]; let b3 = buf[3]; if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); + return Err(utf8_error(buf)); } if (b0 == 0xF0 && b1 < 0x90) || (b0 == 0xF4 && b1 > 0x8F) { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); + return Err(utf8_error(buf)); } ((b0 & 0x07) as u32) << 18 | ((b1 & 0x3F) as u32) << 12 @@ -412,13 +414,7 @@ unsafe impl<'de, C: ConfigCore> SchemaRead<'de, C> for char { _ => unreachable!(), }; - let c = match char::from_u32(code_point) { - Some(c) => c, - None => { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); - } - }; + let c = char::from_u32(code_point).ok_or_else(|| utf8_error(buf))?; unsafe { 
reader.consume_unchecked(len) }; dst.write(c); From c7c9b19eee799cdbcdc2a29505a5006cbc3dc21e Mon Sep 17 00:00:00 2001 From: Kamil Skalski Date: Thu, 19 Feb 2026 07:09:34 +0800 Subject: [PATCH 3/4] Add benchmark for char deserialization --- wincode/benches/benchmarks.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/wincode/benches/benchmarks.rs b/wincode/benches/benchmarks.rs index 4b38fc65..802a92a9 100644 --- a/wincode/benches/benchmarks.rs +++ b/wincode/benches/benchmarks.rs @@ -1,5 +1,6 @@ use { criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}, + rand::{Rng as _, SeedableRng}, serde::{Deserialize, Serialize}, std::{collections::HashMap, hint::black_box}, wincode::{ @@ -96,6 +97,25 @@ fn bench_primitives_comparison(c: &mut Criterion) { group.finish(); } +fn bench_char_deserialization(c: &mut Criterion) { + c.bench_function("char/wincode/deserialize", |b| { + let str: String = rand::prelude::SmallRng::seed_from_u64(0x42) + .sample_iter::<char, _>(rand::distr::StandardUniform) + .take(10_000) + .collect(); + + b.iter(|| { + let mut bytes = black_box(str.as_bytes()); + let mut sum: u32 = 0; + while !bytes.is_empty() { + let ch: char = wincode::deserialize_from(&mut bytes).unwrap(); + sum = sum.wrapping_add(ch as u32); + } + black_box(sum); + }); + }); +} + fn bench_vec_comparison(c: &mut Criterion) { let mut group = c.benchmark_group("Vec"); @@ -862,6 +882,7 @@ criterion_group!( bench_vec_unit_enum_comparison, bench_vec_same_sized_enum_comparison, bench_vec_mixed_sized_enum_comparison, + bench_char_deserialization, ); #[cfg(feature = "solana-short-vec")] From 090e66991dceb504497720215fdd38e0a3f30529 Mon Sep 17 00:00:00 2001 From: Kamil Skalski Date: Thu, 19 Feb 2026 07:10:16 +0800 Subject: [PATCH 4/4] use take_array --- wincode/src/error.rs | 2 ++ wincode/src/schema/impls.rs | 62 +++++++++++++------------------------ 2 files changed, 24 insertions(+), 40 deletions(-) diff --git a/wincode/src/error.rs
b/wincode/src/error.rs index a759e06d..ce5c8bb7 100644 --- a/wincode/src/error.rs +++ b/wincode/src/error.rs @@ -34,6 +34,8 @@ pub enum ReadError { Io(#[from] io::ReadError), #[error(transparent)] InvalidUtf8Encoding(#[from] Utf8Error), + #[error("Decoded UTF-8 value {0} is not a valid character")] + InvalidUtf8Code(u32), #[error("Could not cast integer type to pointer sized type")] PointerSizedReadError, #[error( diff --git a/wincode/src/schema/impls.rs b/wincode/src/schema/impls.rs index f5f53b8b..f1d8b53e 100644 --- a/wincode/src/schema/impls.rs +++ b/wincode/src/schema/impls.rs @@ -349,74 +349,56 @@ unsafe impl<'de, C: ConfigCore> SchemaRead<'de, C> for char { #[inline] fn read(mut reader: impl Reader<'de>, dst: &mut MaybeUninit) -> ReadResult<()> { - let b0 = *reader.peek()?; - - let len = match b0 { - 0x00..=0x7F => 1, - 0xC2..=0xDF => 2, - 0xE0..=0xEF => 3, - 0xF0..=0xF4 => 4, - _ => return Err(invalid_char_lead(b0)), - }; - - if len == 1 { - unsafe { reader.consume_unchecked(1) }; - dst.write(b0 as char); - return Ok(()); - } - - let buf = reader.fill_exact(len)?; + use crate::error::ReadError; // We re-validate with from_utf8 only on error path to get proper Utf8Error. 
- #[inline] #[cold] - fn utf8_error(buf: &[u8]) -> crate::error::ReadError { + fn utf8_error(buf: &[u8]) -> ReadError { invalid_utf8_encoding(core::str::from_utf8(buf).unwrap_err()) } - - // Manual UTF-8 decoder for 2x speedup by avoiding intermediate str allocation - let code_point = match len { - 2 => { - let b1 = buf[1]; + let b0 = *reader.peek()?; + let code_point = match b0 { + 0x00..=0x7F => { + unsafe { reader.consume_unchecked(1) }; + dst.write(b0 as char); + return Ok(()); + } + 0xC2..=0xDF => { + let [b0, b1] = reader.take_array()?; // Validate continuation byte (must be 10xxxxxx) if (b1 & 0xC0) != 0x80 { - return Err(utf8_error(buf)); + return Err(utf8_error(&[b0, b1])); } ((b0 & 0x1F) as u32) << 6 | ((b1 & 0x3F) as u32) } - 3 => { - let b1 = buf[1]; - let b2 = buf[2]; + 0xE0..=0xEF => { + let [b0, b1, b2] = reader.take_array()?; if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 { - return Err(utf8_error(buf)); + return Err(utf8_error(&[b0, b1, b2])); } // Check for overlong encodings (< U+0800) and surrogates (U+D800..U+DFFF) if (b0 == 0xE0 && b1 < 0xA0) || (b0 == 0xED && b1 >= 0xA0) { - return Err(utf8_error(buf)); + return Err(utf8_error(&[b0, b1, b2])); } ((b0 & 0x0F) as u32) << 12 | ((b1 & 0x3F) as u32) << 6 | ((b2 & 0x3F) as u32) } - 4 => { - let b1 = buf[1]; - let b2 = buf[2]; - let b3 = buf[3]; + 0xF0..=0xF4 => { + let [b0, b1, b2, b3] = reader.take_array()?; if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 { - return Err(utf8_error(buf)); + return Err(utf8_error(&[b0, b1, b2, b3])); } if (b0 == 0xF0 && b1 < 0x90) || (b0 == 0xF4 && b1 > 0x8F) { - return Err(utf8_error(buf)); + return Err(utf8_error(&[b0, b1, b2, b3])); } ((b0 & 0x07) as u32) << 18 | ((b1 & 0x3F) as u32) << 12 | ((b2 & 0x3F) as u32) << 6 | ((b3 & 0x3F) as u32) } - _ => unreachable!(), + _ => return Err(invalid_char_lead(b0)), }; - let c = char::from_u32(code_point).ok_or_else(|| utf8_error(buf))?; - - unsafe { reader.consume_unchecked(len) }; + let c = 
char::from_u32(code_point).ok_or(ReadError::InvalidUtf8Code(code_point))?; dst.write(c); Ok(()) }