perf(simd): avx2 fallback to swar instead of sse4.2 (#181)

AaronO · web-flow · commit 47853d73f383 · 2024-09-03T12:19:46.000-04:00
This has massive implications for the default runtime perf, improving how the code is lowered/inlined (falling back to SSE4.2 for a handful of bytes was wasteful). Should supersede #175 and #156.
diff --git a/src/simd/avx2.rs b/src/simd/avx2.rs
@@ -1,7 +1,7 @@
 use crate::iter::Bytes;
 
 #[inline]
-#[target_feature(enable = "avx2", enable = "sse4.2")]
+#[target_feature(enable = "avx2")]
 pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
     while bytes.as_ref().len() >= 32 {
         let advance = match_url_char_32_avx(bytes.as_ref());
@@ -11,8 +11,8 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
             return;
         }
     }
-    // do both, since avx2 only works when bytes.len() >= 32
-    super::sse42::match_uri_vectored(bytes)
+    // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2
+    super::swar::match_uri_vectored(bytes)
 }
 
 #[inline(always)]
@@ -56,7 +56,7 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
     r.trailing_zeros() as usize
 }
 
-#[target_feature(enable = "avx2", enable = "sse4.2")]
+#[target_feature(enable = "avx2")]
 pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) {
     while bytes.as_ref().len() >= 32 {
         let advance = match_header_value_char_32_avx(bytes.as_ref());
@@ -66,8 +66,8 @@ pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) {
             return;
         }
     }
-    // do both, since avx2 only works when bytes.len() >= 32
-    super::sse42::match_header_value_vectored(bytes)
+    // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2
+    super::swar::match_header_value_vectored(bytes)
 }
 
 #[inline(always)]

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`use crate::iter::Bytes;`
`2`	`2`
`3`	`3`	`#[inline]`
`4`		`-#[target_feature(enable = "avx2", enable = "sse4.2")]`
	`4`	`+#[target_feature(enable = "avx2")]`
`5`	`5`	`pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {`
`6`	`6`	`while bytes.as_ref().len() >= 32 {`
`7`	`7`	`let advance = match_url_char_32_avx(bytes.as_ref());`
`@@ -11,8 +11,8 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {`
`11`	`11`	`return;`
`12`	`12`	`}`
`13`	`13`	`}`
`14`		`- // do both, since avx2 only works when bytes.len() >= 32`
`15`		`- super::sse42::match_uri_vectored(bytes)`
	`14`	`+ // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2`
	`15`	`+ super::swar::match_uri_vectored(bytes)`
`16`	`16`	`}`
`17`	`17`
`18`	`18`	`#[inline(always)]`
`@@ -56,7 +56,7 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {`
`56`	`56`	`r.trailing_zeros() as usize`
`57`	`57`	`}`
`58`	`58`
`59`		`-#[target_feature(enable = "avx2", enable = "sse4.2")]`
	`59`	`+#[target_feature(enable = "avx2")]`
`60`	`60`	`pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) {`
`61`	`61`	`while bytes.as_ref().len() >= 32 {`
`62`	`62`	`let advance = match_header_value_char_32_avx(bytes.as_ref());`
`@@ -66,8 +66,8 @@ pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) {`
`66`	`66`	`return;`
`67`	`67`	`}`
`68`	`68`	`}`
`69`		`- // do both, since avx2 only works when bytes.len() >= 32`
`70`		`- super::sse42::match_header_value_vectored(bytes)`
	`69`	`+ // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2`
	`70`	`+ super::swar::match_header_value_vectored(bytes)`
`71`	`71`	`}`
`72`	`72`
`73`	`73`	`#[inline(always)]`