From 5c37c03ae470b84636410b7b076a86498d795f5c Mon Sep 17 00:00:00 2001 From: Fabian Wunsch Date: Fri, 16 Jun 2023 23:48:25 +0200 Subject: [PATCH 1/3] simd digit parsing --- src/parser.rs | 282 ++++++++++++++++++++++++++------------------------ 1 file changed, 148 insertions(+), 134 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 6899d79..c1ec71b 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,6 +1,6 @@ use crate::framebuffer::FrameBuffer; use const_format::formatcp; -use std::simd::{u32x8, Simd, SimdUint}; +use std::simd::{u32x4, u32x8, Simd, SimdPartialOrd, SimdUint, ToBitMask}; use std::sync::Arc; use tokio::io::AsyncWriteExt; @@ -74,163 +74,106 @@ pub async fn parse_pixelflut_commands( let current_command = unsafe { (buffer.as_ptr().add(i) as *const u64).read_unaligned() }; if current_command & 0x00ff_ffff == string_to_number(b"PX \0\0\0\0\0") { i += 3; - // Parse first x coordinate char - if buffer[i] >= b'0' && buffer[i] <= b'9' { - x = (buffer[i] - b'0') as usize; - i += 1; + let (mut x, x_size) = simd_digit_parsing(&buffer[i..i + 4]); + i += x_size + 1; + let (mut y, y_size) = simd_digit_parsing(&buffer[i..i + 4]); + i += y_size; - // Parse optional second x coordinate char - if buffer[i] >= b'0' && buffer[i] <= b'9' { - // TODO: Test bitshifts and add instead of multiplication - // i = (i << 3) + (i << 1); - // i = (i * 8) + (i * 2); - // i = 8i + 2i - // i = 10i - x = 10 * x + (buffer[i] - b'0') as usize; - i += 1; - - // Parse optional third x coordinate char - if buffer[i] >= b'0' && buffer[i] <= b'9' { - x = 10 * x + (buffer[i] - b'0') as usize; - i += 1; - - // Parse optional forth x coordinate char - if buffer[i] >= b'0' && buffer[i] <= b'9' { - x = 10 * x + (buffer[i] - b'0') as usize; - i += 1; - } - } - } + if x_size != 0 && y_size != 0 { + x += connection_x_offset; + y += connection_y_offset; - // Separator between x and y + // Separator between coordinates and color if buffer[i] == b' ' { i += 1; - // Parse first y coordinate char - if buffer[i] >= b'0' && buffer[i] <= b'9' { - y = (buffer[i] - b'0') as usize; - i += 1; + // TODO: Determine what clients use more: RGB, RGBA or gg variant. + // If RGBA is used more often move the RGB code below the RGBA code - // Parse optional second y coordinate char - if buffer[i] >= b'0' && buffer[i] <= b'9' { - y = 10 * y + (buffer[i] - b'0') as usize; - i += 1; + // Must be followed by 6 bytes RGB and newline or ... + if buffer[i + 6] == b'\n' { + last_byte_parsed = i + 6; + i += 7; // We can advance one byte more than normal as we use continue and therefore not get incremented at the end of the loop - // Parse optional third y coordinate char - if buffer[i] >= b'0' && buffer[i] <= b'9' { - y = 10 * y + (buffer[i] - b'0') as usize; - i += 1; - - // Parse optional forth y coordinate char - if buffer[i] >= b'0' && buffer[i] <= b'9' { - y = 10 * y + (buffer[i] - b'0') as usize; - i += 1; - } - } - } + let rgba: u32 = simd_unhex(&buffer[i - 7..i + 1]); - x += connection_x_offset; - y += connection_y_offset; - - // Separator between coordinates and color - if buffer[i] == b' ' { - i += 1; - - // TODO: Determine what clients use more: RGB, RGBA or gg variant. - // If RGBA is used more often move the RGB code below the RGBA code - - // Must be followed by 6 bytes RGB and newline or ... - if buffer[i + 6] == b'\n' { - last_byte_parsed = i + 6; - i += 7; // We can advance one byte more than normal as we use continue and therefore not get incremented at the end of the loop + fb.set(x, y, rgba & 0x00ff_ffff); + continue; + } - let rgba: u32 = simd_unhex(&buffer[i - 7..i + 1]); + // ... or must be followed by 8 bytes RGBA and newline + #[cfg(not(feature = "alpha"))] + if buffer[i + 8] == b'\n' { + last_byte_parsed = i + 8; + i += 9; // We can advance one byte more than normal as we use continue and therefore not get incremented at the end of the loop - fb.set(x, y, rgba & 0x00ff_ffff); - continue; - } + let rgba: u32 = simd_unhex(&buffer[i - 9..i - 1]); - // ... or must be followed by 8 bytes RGBA and newline - #[cfg(not(feature = "alpha"))] - if buffer[i + 8] == b'\n' { - last_byte_parsed = i + 8; - i += 9; // We can advance one byte more than normal as we use continue and therefore not get incremented at the end of the loop + fb.set(x, y, rgba & 0x00ff_ffff); + continue; + } + #[cfg(feature = "alpha")] + if buffer[i + 8] == b'\n' { + last_byte_parsed = i + 8; + i += 9; // We can advance one byte more than normal as we use continue and therefore not get incremented at the end of the loop - let rgba: u32 = simd_unhex(&buffer[i - 9..i - 1]); + let rgba = simd_unhex(&buffer[i - 9..i - 1]); - fb.set(x, y, rgba & 0x00ff_ffff); - continue; - } - #[cfg(feature = "alpha")] - if buffer[i + 8] == b'\n' { - last_byte_parsed = i + 8; - i += 9; // We can advance one byte more than normal as we use continue and therefore not get incremented at the end of the loop + let alpha = (rgba >> 24) & 0xff; - let rgba = simd_unhex(&buffer[i - 9..i - 1]); + if alpha == 0 || x >= fb.get_width() || y >= fb.get_height() { + continue; + } - let alpha = (rgba >> 24) & 0xff; + let alpha_comp = 0xff - alpha; + let current = fb.get_unchecked(x, y); + let r = (rgba >> 16) & 0xff; + let g = (rgba >> 8) & 0xff; + let b = rgba & 0xff; - if alpha == 0 || x >= fb.get_width() || y >= fb.get_height() { - continue; - } + let r: u32 = (((current >> 24) & 0xff) * alpha_comp + r * alpha) / 0xff; + let g: u32 = (((current >> 16) & 0xff) * alpha_comp + g * alpha) / 0xff; + let b: u32 = (((current >> 8) & 0xff) * alpha_comp + b * alpha) / 0xff; - let alpha_comp = 0xff - alpha; - let current = fb.get_unchecked(x, y); - let r = (rgba >> 16) & 0xff; - let g = (rgba >> 8) & 0xff; - let b = rgba & 0xff; - - let r: u32 = - (((current >> 24) & 0xff) * alpha_comp + r * alpha) / 0xff; - let g: u32 = - (((current >> 16) & 0xff) * alpha_comp + g * alpha) / 0xff; - let b: u32 = - (((current >> 8) & 0xff) * alpha_comp + b * alpha) / 0xff; - - fb.set(x, y, r << 16 | g << 8 | b); - continue; - } - - // ... for the efficient/lazy clients - if buffer[i + 2] == b'\n' { - last_byte_parsed = i + 2; - i += 3; // We can advance one byte more than normal as we use continue and therefore not get incremented at the end of the loop + fb.set(x, y, r << 16 | g << 8 | b); + continue; + } - let base = simd_unhex(&buffer[i - 3..i + 5]) & 0xff; + // ... for the efficient/lazy clients + if buffer[i + 2] == b'\n' { + last_byte_parsed = i + 2; + i += 3; // We can advance one byte more than normal as we use continue and therefore not get incremented at the end of the loop - let rgba: u32 = base << 16 | base << 8 | base; + let base = simd_unhex(&buffer[i - 3..i + 5]) & 0xff; - fb.set(x, y, rgba); + let rgba: u32 = base << 16 | base << 8 | base; - continue; - } - } + fb.set(x, y, rgba); - // End of command to read Pixel value - if buffer[i] == b'\n' { - last_byte_parsed = i; - i += 1; - if let Some(rgb) = fb.get(x, y) { - match stream - .write_all( - format!( - "PX {} {} {:06x}\n", - // We don't want to return the actual (absolute) coordinates, the client should also get the result offseted - x - connection_x_offset, - y - connection_y_offset, - rgb.to_be() >> 8 - ) - .as_bytes(), - ) - .await - { - Ok(_) => (), - Err(_) => continue, - } - } - continue; + continue; + } + } else if buffer[i] == b'\n' { + last_byte_parsed = i; + i += 1; + if let Some(rgb) = fb.get(x, y) { + match stream + .write_all( + format!( + "PX {} {} {:06x}\n", + // We don't want to return the actual (absolute) coordinates, the client should also get the result offseted + x - connection_x_offset, + y - connection_y_offset, + rgb.to_be() >> 8 + ) + .as_bytes(), + ) + .await + { + Ok(_) => (), + Err(_) => continue, } } + continue; } } } else if current_command & 0x0000_ffff_ffff_ffff == string_to_number(b"OFFSET \0\0") { @@ -356,6 +299,44 @@ fn simd_unhex(value: &[u8]) -> u32 { shifted.reduce_or() } +const SIMD_0_CHAR: Simd = u32x4::from_array([b'0' as u32; 4]); +const SIMD_10: Simd = u32x4::from_array([10; 4]); +const DIGIT_FACTORS: [Simd; 5] = [ + u32x4::from_array([0; 4]), + u32x4::from_array([1, 0, 0, 0]), + u32x4::from_array([10, 1, 0, 0]), + u32x4::from_array([100, 10, 1, 0]), + u32x4::from_array([1000, 100, 10, 1]), +]; + +/// count, how many digits a number has, based on the map of space characters +/// the mask is composed as follows: +/// {4th char is space}{3rd char is space}{2nd char is space}{1st char is space} +/// guarantees that the result is in (inclusive) 0-4 +#[inline(always)] +fn count_digits(space_mask: u8) -> u32 { + (space_mask | 0b10000).trailing_zeros() +} + +#[inline(always)] +fn simd_digit_parsing(value: &[u8]) -> (usize, usize) { + // using u16 instead of u32 for the simd pipeline takes 20% longer for some reason + let input = u32x4::from_array([ + value[0] as u32, + value[1] as u32, + value[2] as u32, + value[3] as u32, + ]); + let converted_digits = input - SIMD_0_CHAR; + let is_space = converted_digits.simd_ge(SIMD_10); + let space_mask = is_space.to_bitmask(); + let digits = count_digits(space_mask) as usize; + // values other than (inclusive) 0-4 are impossible + let digit_factor = unsafe { DIGIT_FACTORS.get_unchecked(digits) }; + let multiplied_digits = converted_digits * digit_factor; + (multiplied_digits.reduce_sum() as usize, digits) +} + #[cfg(test)] mod test { use super::*; @@ -365,4 +346,37 @@ mod test { assert_eq!(simd_unhex(b"01234567"), 0x67452301); assert_eq!(simd_unhex(b"fedcba98"), 0x98badcfe); } + + #[test] + fn test_count_digits() { + assert_eq!(count_digits(0b0000), 4); + assert_eq!(count_digits(0b0001), 0); + assert_eq!(count_digits(0b0010), 1); + assert_eq!(count_digits(0b0011), 0); + assert_eq!(count_digits(0b0100), 2); + assert_eq!(count_digits(0b0101), 0); + assert_eq!(count_digits(0b0110), 1); + assert_eq!(count_digits(0b0111), 0); + assert_eq!(count_digits(0b1000), 3); + assert_eq!(count_digits(0b1001), 0); + assert_eq!(count_digits(0b1010), 1); + assert_eq!(count_digits(0b1011), 0); + assert_eq!(count_digits(0b1100), 2); + assert_eq!(count_digits(0b1101), 0); + assert_eq!(count_digits(0b1110), 1); + assert_eq!(count_digits(0b1111), 0); + } + + #[test] + fn test_digit_parsing() { + assert_eq!(simd_digit_parsing(b"0123"), (123, 4)); + assert_eq!(simd_digit_parsing(b"0 23"), (0, 1)); + assert_eq!(simd_digit_parsing(b"5555"), (5555, 4)); + assert_eq!(simd_digit_parsing(b"12 3"), (12, 2)); + assert_eq!(simd_digit_parsing(b"123 "), (123, 3)); + assert_eq!(simd_digit_parsing(b"1123"), (1123, 4)); + assert_eq!(simd_digit_parsing(b" 123"), (0, 0)); + assert_eq!(simd_digit_parsing(b"1\n123"), (1, 1)); + assert_eq!(simd_digit_parsing(b"1a23"), (1, 1)); + } } From cc229f5f366d5a8ed1d4d6788dffa1edfad244bd Mon Sep 17 00:00:00 2001 From: Fabian Wunsch Date: Mon, 19 Jun 2023 17:32:06 +0200 Subject: [PATCH 2/3] not a single array lookup in sight --- src/parser.rs | 68 +++++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index c1ec71b..75cb350 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,6 +1,6 @@ use crate::framebuffer::FrameBuffer; use const_format::formatcp; -use std::simd::{u32x4, u32x8, Simd, SimdPartialOrd, SimdUint, ToBitMask}; +use std::simd::{u32x4, u32x8, Simd, SimdPartialOrd, SimdUint, ToBitMask, u8x16}; use std::sync::Arc; use tokio::io::AsyncWriteExt; @@ -74,9 +74,9 @@ pub async fn parse_pixelflut_commands( let current_command = unsafe { (buffer.as_ptr().add(i) as *const u64).read_unaligned() }; if current_command & 0x00ff_ffff == string_to_number(b"PX \0\0\0\0\0") { i += 3; - let (mut x, x_size) = simd_digit_parsing(&buffer[i..i + 4]); + let (mut x, x_size) = simd_digit_parsing(unsafe {buffer.as_ptr().add(i)}); i += x_size + 1; - let (mut y, y_size) = simd_digit_parsing(&buffer[i..i + 4]); + let (mut y, y_size) = simd_digit_parsing(unsafe {buffer.as_ptr().add(i)}); i += y_size; if x_size != 0 && y_size != 0 { @@ -299,42 +299,36 @@ fn simd_unhex(value: &[u8]) -> u32 { shifted.reduce_or() } -const SIMD_0_CHAR: Simd = u32x4::from_array([b'0' as u32; 4]); -const SIMD_10: Simd = u32x4::from_array([10; 4]); -const DIGIT_FACTORS: [Simd; 5] = [ - u32x4::from_array([0; 4]), - u32x4::from_array([1, 0, 0, 0]), - u32x4::from_array([10, 1, 0, 0]), - u32x4::from_array([100, 10, 1, 0]), - u32x4::from_array([1000, 100, 10, 1]), -]; +const SIMD_POS: Simd = u8x16::from_array([ + 255, 251, 251, 251, // interesting data + 254, 251, 251, 251, // just zero em all + 253, 251, 251, 251, // It doesn't matter that I'm subtracting + 252, 251, 251, 251, // as all values where the highest bit is 1 will be zeroed +]); +const FACTORS: Simd = u32x4::from_array([1, 10, 100, 1000]); /// count, how many digits a number has, based on the map of space characters /// the mask is composed as follows: /// {4th char is space}{3rd char is space}{2nd char is space}{1st char is space} /// guarantees that the result is in (inclusive) 0-4 #[inline(always)] -fn count_digits(space_mask: u8) -> u32 { +fn count_digits(space_mask: u16) -> u32 { (space_mask | 0b10000).trailing_zeros() } -#[inline(always)] -fn simd_digit_parsing(value: &[u8]) -> (usize, usize) { +#[inline(never)] +fn simd_digit_parsing(value: *const u8) -> (usize, usize) { // using u16 instead of u32 for the simd pipeline takes 20% longer for some reason - let input = u32x4::from_array([ - value[0] as u32, - value[1] as u32, - value[2] as u32, - value[3] as u32, - ]); - let converted_digits = input - SIMD_0_CHAR; - let is_space = converted_digits.simd_ge(SIMD_10); + let input = u8x16::from_array(unsafe {(value as *const [u8; 16]).read_unaligned()}); + let converted_digits = input - u8x16::splat(b'0'); + let is_space = converted_digits.simd_gt(u8x16::splat(9)); let space_mask = is_space.to_bitmask(); - let digits = count_digits(space_mask) as usize; - // values other than (inclusive) 0-4 are impossible - let digit_factor = unsafe { DIGIT_FACTORS.get_unchecked(digits) }; - let multiplied_digits = converted_digits * digit_factor; - (multiplied_digits.reduce_sum() as usize, digits) + let digits = count_digits(space_mask); + let swizzle_idx = SIMD_POS + u8x16::splat(digits as u8); + let swizzled = converted_digits.swizzle_dyn(swizzle_idx); + let casted_swizzle = unsafe { *(&swizzled as *const u8x16 as *const u32x4)}; + let multiplied = casted_swizzle * FACTORS; + (multiplied.reduce_sum() as usize, digits as usize) } #[cfg(test)] @@ -369,14 +363,14 @@ mod test { #[test] fn test_digit_parsing() { - assert_eq!(simd_digit_parsing(b"0123"), (123, 4)); - assert_eq!(simd_digit_parsing(b"0 23"), (0, 1)); - assert_eq!(simd_digit_parsing(b"5555"), (5555, 4)); - assert_eq!(simd_digit_parsing(b"12 3"), (12, 2)); - assert_eq!(simd_digit_parsing(b"123 "), (123, 3)); - assert_eq!(simd_digit_parsing(b"1123"), (1123, 4)); - assert_eq!(simd_digit_parsing(b" 123"), (0, 0)); - assert_eq!(simd_digit_parsing(b"1\n123"), (1, 1)); - assert_eq!(simd_digit_parsing(b"1a23"), (1, 1)); + assert_eq!(simd_digit_parsing(b"0123".as_ptr()), (123, 4)); + assert_eq!(simd_digit_parsing(b"0 23".as_ptr()), (0, 1)); + assert_eq!(simd_digit_parsing(b"5555".as_ptr()), (5555, 4)); + assert_eq!(simd_digit_parsing(b"12 3".as_ptr()), (12, 2)); + assert_eq!(simd_digit_parsing(b"123 ".as_ptr()), (123, 3)); + assert_eq!(simd_digit_parsing(b"1123".as_ptr()), (1123, 4)); + assert_eq!(simd_digit_parsing(b" 123".as_ptr()), (0, 0)); + assert_eq!(simd_digit_parsing(b"1\n123".as_ptr()), (1, 1)); + assert_eq!(simd_digit_parsing(b"1a23".as_ptr()), (1, 1)); } } From 685b97654ea30571d1872334d2027ccbcbdbf1f7 Mon Sep 17 00:00:00 2001 From: Fabian Wunsch Date: Mon, 19 Jun 2023 17:35:10 +0200 Subject: [PATCH 3/3] missed the imports while merging --- src/parser.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parser.rs b/src/parser.rs index 2c15c95..ea9b779 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,7 +1,8 @@ use crate::framebuffer::FrameBuffer; use const_format::formatcp; -use std::simd::{u32x8, Simd, SimdUint}; +use std::simd::{u32x8, Simd, SimdUint, u8x16, u32x4, SimdPartialOrd, ToBitMask}; use std::sync::Arc; +use log::{info, warn}; use tokio::io::AsyncWriteExt; pub const PARSER_LOOKAHEAD: usize = "PX 1234 1234 rrggbbaa\n".len(); // Longest possible command