From e6e80ac8a10edd24ca9fdb2ec9f24ba71ffc0b65 Mon Sep 17 00:00:00 2001 From: tanmay4l Date: Mon, 17 Nov 2025 15:49:02 +0530 Subject: [PATCH 1/2] Optimize char deserialization with manual UTF-8 decoder --- wincode/src/schema/impls.rs | 60 +++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/wincode/src/schema/impls.rs b/wincode/src/schema/impls.rs index af326e85..f7c9fef2 100644 --- a/wincode/src/schema/impls.rs +++ b/wincode/src/schema/impls.rs @@ -244,12 +244,60 @@ impl<'de> SchemaRead<'de> for char { } let buf = reader.fill_exact(len)?; - // TODO: Could implement a manual decoder that avoids UTF-8 validate + chars() - // and instead performs the UTF-8 validity checks and produces a `char` directly. - // Some quick micro-benchmarking revealed a roughly 2x speedup is possible, - // but this is on the order of a 1-2ns/byte delta. - let str = core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - let c = str.chars().next().unwrap(); + + // Manual UTF-8 decoder: validates and decodes in one pass instead of from_utf8 followed by chars() (roughly 2x faster in micro-benchmarks) + let code_point = match len { + 2 => { + let b1 = buf[1]; + // Validate continuation byte (must be 10xxxxxx) + if (b1 & 0xC0) != 0x80 { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + ((b0 & 0x1F) as u32) << 6 | ((b1 & 0x3F) as u32) + } + 3 => { + let b1 = buf[1]; + let b2 = buf[2]; + if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + // Check for overlong encodings (< U+0800) and surrogates (U+D800..U+DFFF) + if (b0 == 0xE0 && b1 < 0xA0) || (b0 == 0xED && b1 >= 0xA0) { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + ((b0 & 0x0F) as u32) << 12 | ((b1 & 0x3F) as u32) << 6 | ((b2 & 0x3F) as u32) + } + 4 => { + let b1 = buf[1]; + let b2 = buf[2]; + let b3 = buf[3]; + if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 { + 
core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + if (b0 == 0xF0 && b1 < 0x90) || (b0 == 0xF4 && b1 > 0x8F) { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + ((b0 & 0x07) as u32) << 18 + | ((b1 & 0x3F) as u32) << 12 + | ((b2 & 0x3F) as u32) << 6 + | ((b3 & 0x3F) as u32) + } + _ => unreachable!(), + }; + + let c = match char::from_u32(code_point) { + Some(c) => c, + None => { + core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; + unreachable!(); + } + }; + unsafe { reader.consume_unchecked(len) }; dst.write(c); Ok(()) From 3a2c574bcc7d94116d9d2e0348299df87e6a12b5 Mon Sep 17 00:00:00 2001 From: tanmay4l Date: Mon, 17 Nov 2025 21:05:03 +0530 Subject: [PATCH 2/2] Clippy-clean --- wincode/src/schema/impls.rs | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/wincode/src/schema/impls.rs b/wincode/src/schema/impls.rs index f7c9fef2..c62d36e2 100644 --- a/wincode/src/schema/impls.rs +++ b/wincode/src/schema/impls.rs @@ -245,14 +245,20 @@ impl<'de> SchemaRead<'de> for char { let buf = reader.fill_exact(len)?; + // We re-validate with from_utf8 only on error path to get proper Utf8Error. 
+ #[inline] + #[cold] + fn utf8_error(buf: &[u8]) -> crate::error::ReadError { + invalid_utf8_encoding(core::str::from_utf8(buf).unwrap_err()) + } + // Manual UTF-8 decoder: validates and decodes in one pass instead of from_utf8 followed by chars() (roughly 2x faster in micro-benchmarks) let code_point = match len { 2 => { let b1 = buf[1]; // Validate continuation byte (must be 10xxxxxx) if (b1 & 0xC0) != 0x80 { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); + return Err(utf8_error(buf)); } ((b0 & 0x1F) as u32) << 6 | ((b1 & 0x3F) as u32) } @@ -260,13 +266,11 @@ impl<'de> SchemaRead<'de> for char { let b1 = buf[1]; let b2 = buf[2]; if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); + return Err(utf8_error(buf)); } // Check for overlong encodings (< U+0800) and surrogates (U+D800..U+DFFF) if (b0 == 0xE0 && b1 < 0xA0) || (b0 == 0xED && b1 >= 0xA0) { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); + return Err(utf8_error(buf)); } ((b0 & 0x0F) as u32) << 12 | ((b1 & 0x3F) as u32) << 6 | ((b2 & 0x3F) as u32) } @@ -275,12 +279,10 @@ impl<'de> SchemaRead<'de> for char { let b2 = buf[2]; let b3 = buf[3]; if (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); + return Err(utf8_error(buf)); } if (b0 == 0xF0 && b1 < 0x90) || (b0 == 0xF4 && b1 > 0x8F) { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); + return Err(utf8_error(buf)); } ((b0 & 0x07) as u32) << 18 | ((b1 & 0x3F) as u32) << 12 @@ -290,13 +292,7 @@ impl<'de> SchemaRead<'de> for char { _ => unreachable!(), }; - let c = match char::from_u32(code_point) { - Some(c) => c, - None => { - core::str::from_utf8(buf).map_err(invalid_utf8_encoding)?; - unreachable!(); - } - }; + let c = char::from_u32(code_point).ok_or_else(|| utf8_error(buf))?; unsafe { reader.consume_unchecked(len) }; dst.write(c);