halide · abadams · Dec 27, 2024 · Dec 23, 2024
diff --git a/src/IROperator.h b/src/IROperator.h
@@ -970,28 +970,32 @@ Expr pow(Expr x, Expr y);
  * mantissa. Vectorizes cleanly. */
 Expr erf(const Expr &x);
 
-/** Fast vectorizable approximation to some trigonometric functions for Float(32).
- * Absolute approximation error is less than 1e-5. */
+/** Fast vectorizable approximation to some trigonometric functions for
+ * Float(32). Absolute approximation error is less than 1e-5. Slow on x86
+ * without at least SSE 4.1. */
 // @{
 Expr fast_sin(const Expr &x);
 Expr fast_cos(const Expr &x);
 // @}
 
 /** Fast approximate cleanly vectorizable log for Float(32). Returns
  * nonsense for x <= 0.0f. Accurate up to the last 5 bits of the
- * mantissa. Vectorizes cleanly. */
+ * mantissa. Vectorizes cleanly. Slow on x86 without at least
+ * SSE 4.1. */
 Expr fast_log(const Expr &x);
 
 /** Fast approximate cleanly vectorizable exp for Float(32). Returns
  * nonsense for inputs that would overflow or underflow. Typically
  * accurate up to the last 5 bits of the mantissa. Gets worse when
- * approaching overflow. Vectorizes cleanly. */
+ * approaching overflow. Vectorizes cleanly. Slow on x86 without at
+ * least SSE 4.1. */
 Expr fast_exp(const Expr &x);
 
 /** Fast approximate cleanly vectorizable pow for Float(32). Returns
  * nonsense for x < 0.0f. Accurate up to the last 5 bits of the
  * mantissa for typical exponents. Gets worse when approaching
- * overflow. Vectorizes cleanly. */
+ * overflow. Vectorizes cleanly. Slow on x86 without at least
+ * SSE 4.1. */
 Expr fast_pow(Expr x, Expr y);
 
 /** Fast approximate inverse for Float(32). Corresponds to the rcpps

diff --git a/test/performance/fast_pow.cpp b/test/performance/fast_pow.cpp
@@ -20,6 +20,12 @@ int main(int argc, char **argv) {
     printf("HL_TARGET is:     %s\n", hl_target.to_string().c_str());
     printf("HL_JIT_TARGET is: %s\n", hl_jit_target.to_string().c_str());
 
+    if (hl_jit_target.arch == Target::X86 &&
+        !hl_jit_target.has_feature(Target::SSE41)) {
+        printf("[SKIP] These intrinsics are known to be slow on x86 without sse 4.1.\n");
+        return 0;
+    }
+
     if (hl_jit_target.arch == Target::WebAssembly) {
         printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
         return 0;

diff --git a/test/performance/fast_sine_cosine.cpp b/test/performance/fast_sine_cosine.cpp
@@ -10,6 +10,13 @@ using namespace Halide::Tools;
 
 int main(int argc, char **argv) {
     Target target = get_jit_target_from_environment();
+
+    if (target.arch == Target::X86 &&
+        !target.has_feature(Target::SSE41)) {
+        printf("[SKIP] These intrinsics are known to be slow on x86 without sse 4.1.\n");
+        return 0;
+    }
+
     if (target.arch == Target::WebAssembly) {
         printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
         return 0;