diff --git a/thirdparty/simde/arm/neon/abd.h b/thirdparty/simde/arm/neon/abd.h index fdb1131ad..54dad5550 100644 --- a/thirdparty/simde/arm/neon/abd.h +++ b/thirdparty/simde/arm/neon/abd.h @@ -50,7 +50,7 @@ simde_vabdh_f16(simde_float16_t a, simde_float16_t b) { return r_ < 0 ? simde_float16_from_float32(-r_) : simde_float16_from_float32(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vabdh_f16 #define vabdh_f16(a, b) simde_vabdh_f16((a), (b)) #endif @@ -94,7 +94,7 @@ simde_vabd_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_vabs_f16(simde_vsub_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vabd_f16 #define vabd_f16(a, b) simde_vabd_f16((a), (b)) #endif @@ -315,7 +315,7 @@ simde_vabdq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_vabsq_f16(simde_vsubq_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vabdq_f16 #define vabdq_f16(a, b) simde_vabdq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/abs.h b/thirdparty/simde/arm/neon/abs.h index 3cc11d4d7..3fd507c32 100644 --- a/thirdparty/simde/arm/neon/abs.h +++ b/thirdparty/simde/arm/neon/abs.h @@ -44,7 +44,8 @@ simde_vabsd_s64(int64_t a) { return a < 0 ? 
-a : a; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,1,0))) #undef vabsd_s64 #define vabsd_s64(a) simde_vabsd_s64(a) #endif @@ -60,7 +61,8 @@ simde_vabsh_f16(simde_float16_t a) { return (a_ >= 0.0f) ? simde_float16_from_float32(a_) : simde_float16_from_float32(-a_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vabsh_f16 #define vabsh_f16(a) simde_vabsh_f16(a) #endif @@ -87,7 +89,8 @@ simde_vabs_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vabs_f16 #define vabs_f16(a) simde_vabs_f16(a) #endif @@ -166,7 +169,7 @@ simde_vabs_s8(simde_int8x8_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; + r_.values[i] = a_.values[i] < 0 ? HEDLEY_STATIC_CAST(int8_t, -a_.values[i]) : a_.values[i]; } #endif @@ -294,7 +297,8 @@ simde_vabsq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vabsq_f16 #define vabsq_f16(a) simde_vabsq_f16(a) #endif @@ -395,7 +399,7 @@ simde_vabsq_s8(simde_int8x16_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? 
-a_.values[i] : a_.values[i]; + r_.values[i] = a_.values[i] < 0 ? HEDLEY_STATIC_CAST(int8_t, -a_.values[i]) : a_.values[i]; } #endif @@ -433,7 +437,7 @@ simde_vabsq_s16(simde_int16x8_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; + r_.values[i] = a_.values[i] < 0 ? HEDLEY_STATIC_CAST(int16_t, -a_.values[i]) : a_.values[i]; } #endif diff --git a/thirdparty/simde/arm/neon/add.h b/thirdparty/simde/arm/neon/add.h index 8b4fe3499..5f2922042 100644 --- a/thirdparty/simde/arm/neon/add.h +++ b/thirdparty/simde/arm/neon/add.h @@ -46,7 +46,8 @@ simde_vaddh_f16(simde_float16_t a, simde_float16_t b) { return simde_float16_from_float32(af + bf); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vaddh_f16 #define vaddh_f16(a, b) simde_vaddh_f16((a), (b)) #endif @@ -102,7 +103,8 @@ simde_vadd_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vadd_f16 #define vadd_f16(a, b) simde_vadd_f16((a), (b)) #endif @@ -437,7 +439,8 @@ simde_vaddq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vaddq_f16 #define vaddq_f16(a, b) simde_vaddq_f16((a), (b)) #endif @@ -809,7 +812,8 @@ simde_vadd_p8(simde_poly8x8_t a, simde_poly8x8_t b) { return simde_poly8x8_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(_GCC_ARM_NEON_H)) #undef vadd_p8 #define vadd_p8(a, b) simde_vadd_p8((a), (b)) #endif @@ -833,7 +837,8 @@ simde_vadd_p16(simde_poly16x4_t a, simde_poly16x4_t b) { return simde_poly16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(_GCC_ARM_NEON_H)) #undef vadd_p16 #define vadd_p16(a, b) simde_vadd_p16((a), (b)) #endif @@ -858,7 +863,9 @@ simde_vadd_p64(simde_poly64x1_t a, simde_poly64x1_t b) { return simde_poly64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H))) #undef vadd_p64 #define vadd_p64(a, b) simde_vadd_p64((a), (b)) #endif @@ -882,7 +889,8 @@ simde_vaddq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { return simde_poly8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(_GCC_ARM_NEON_H)) #undef vaddq_p8 #define vaddq_p8(a, b) simde_vaddq_p8((a), (b)) #endif @@ -906,7 +914,8 @@ simde_vaddq_p16(simde_poly16x8_t a, simde_poly16x8_t b) { return simde_poly16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(_GCC_ARM_NEON_H)) #undef vaddq_p16 #define vaddq_p16(a, b) simde_vaddq_p16((a), (b)) #endif @@ -931,7 +940,9 @@ simde_vaddq_p64(simde_poly64x2_t a, simde_poly64x2_t b) { return simde_poly64x2_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H))) #undef vaddq_p64 #define vaddq_p64(a, b) simde_vaddq_p64((a), (b)) #endif @@ -950,7 +961,9 @@ simde_vaddq_p128(simde_poly128_t a, simde_poly128_t b) { return b ^ ((0 ^ a) & mask); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H))) #undef vaddq_p128 #define vaddq_p128(a, b) simde_vaddq_p128((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/aes.h b/thirdparty/simde/arm/neon/aes.h index 4e6896fc8..bfda94516 100644 --- a/thirdparty/simde/arm/neon/aes.h +++ b/thirdparty/simde/arm/neon/aes.h @@ -84,7 +84,8 @@ simde_vaeseq_u8(simde_uint8x16_t data, simde_uint8x16_t key) { return simde_uint8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_AES))) #undef vaeseq_u8 #define vaeseq_u8(data, key) simde_vaeseq_u8((data), (key)) #endif @@ -132,7 +133,8 @@ simde_vaesdq_u8(simde_uint8x16_t data, simde_uint8x16_t key) { return simde_uint8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_AES))) #undef vaesdq_u8 #define vaesdq_u8(data, key) simde_vaesdq_u8((data), (key)) #endif @@ -160,7 +162,8 @@ simde_vaesmcq_u8(simde_uint8x16_t data) { return simde_uint8x16_from_private(a_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ 
+ !(defined(SIMDE_ARCH_ARM_AES))) #undef vaesmcq_u8 #define vaesmcq_u8(data) simde_vaesmcq_u8((data)) #endif @@ -207,7 +210,8 @@ simde_vaesimcq_u8(simde_uint8x16_t data) { return simde_uint8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_AES))) #undef vaesimcq_u8 #define vaesimcq_u8(data) simde_vaesimcq_u8((data)) #endif diff --git a/thirdparty/simde/arm/neon/and.h b/thirdparty/simde/arm/neon/and.h index 185683d75..ca00b17b1 100644 --- a/thirdparty/simde/arm/neon/and.h +++ b/thirdparty/simde/arm/neon/and.h @@ -111,7 +111,7 @@ simde_vand_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); - #elif defined(SIMDE_RISCV_V_NATIVE) + #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv64 = __riscv_vand_vv_i32m1(a_.sv64, b_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; diff --git a/thirdparty/simde/arm/neon/bsl.h b/thirdparty/simde/arm/neon/bsl.h index 40cdac89f..93ded55de 100644 --- a/thirdparty/simde/arm/neon/bsl.h +++ b/thirdparty/simde/arm/neon/bsl.h @@ -62,7 +62,8 @@ simde_vbsl_f16(simde_uint16x4_t a, simde_float16x4_t b, simde_float16x4_t c) { return simde_vreinterpret_f16_u16(simde_uint16x4_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vbsl_f16 #define vbsl_f16(a, b, c) simde_vbsl_f16((a), (b), (c)) #endif @@ -381,7 +382,8 @@ simde_vbslq_f16(simde_uint16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { return simde_vreinterpretq_f16_u16(simde_uint16x8_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vbslq_f16 #define vbslq_f16(a, b, c) simde_vbslq_f16((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/cadd_rot270.h b/thirdparty/simde/arm/neon/cadd_rot270.h index 624a7c8a5..cc2ca641f 100644 --- a/thirdparty/simde/arm/neon/cadd_rot270.h +++ b/thirdparty/simde/arm/neon/cadd_rot270.h @@ -41,8 +41,8 @@ _Pragma("clang diagnostic ignored \"-Wimplicit-float-conversion\"") SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) return vcadd_rot270_f16(a, b); #else @@ -52,7 +52,7 @@ simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); r_.sv64 = __riscv_vfadd_vv_f16m1(op1, a_.sv64, 4); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 5, 0, 7, 2); r_.values = b_.values + a_.values; @@ -68,7 +68,10 @@ simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)))) #undef vcadd_rot270_f16 #define vcadd_rot270_f16(a, b) simde_vcadd_rot270_f16(a, b) #endif @@ -76,8 +79,8 @@ simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) return vcaddq_rot270_f16(a, b); #else @@ -88,7 +91,7 @@ simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); r_.sv128 = __riscv_vfadd_vv_f16m1(op1, a_.sv128, 8); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) b_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, -b_.values, b_.values, 9, 0, 11, 2, 13, 4, 15, 6); r_.values = b_.values + a_.values; @@ -104,7 +107,10 @@ simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + 
!(defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)))) #undef vcaddq_rot270_f16 #define vcaddq_rot270_f16(a, b) simde_vcaddq_rot270_f16(a, b) #endif @@ -112,9 +118,9 @@ simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcadd_rot270_f32(simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) return vcadd_rot270_f32(a, b); #else simde_float32x2_private r_, a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); @@ -137,7 +143,10 @@ simde_float32x2_t simde_vcadd_rot270_f32(simde_float32x2_t a, simde_float32x2_t return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) #undef vcadd_rot270_f32 #define vcadd_rot270_f32(a, b) simde_vcadd_rot270_f32(a, b) #endif @@ -145,9 +154,9 @@ simde_float32x2_t simde_vcadd_rot270_f32(simde_float32x2_t a, simde_float32x2_t SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcaddq_rot270_f32(simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - 
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) return vcaddq_rot270_f32(a, b); #else simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); @@ -171,7 +180,10 @@ simde_float32x4_t simde_vcaddq_rot270_f32(simde_float32x4_t a, simde_float32x4_t return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) #undef vcaddq_rot270_f32 #define vcaddq_rot270_f32(a, b) simde_vcaddq_rot270_f32(a, b) #endif @@ -179,9 +191,9 @@ simde_float32x4_t simde_vcaddq_rot270_f32(simde_float32x4_t a, simde_float32x4_t SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vcaddq_rot270_f64(simde_float64x2_t a, simde_float64x2_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) return vcaddq_rot270_f64(a, b); #else simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); @@ -205,7 +217,10 @@ simde_float64x2_t simde_vcaddq_rot270_f64(simde_float64x2_t a, 
simde_float64x2_t return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) #undef vcaddq_rot270_f64 #define vcaddq_rot270_f64(a, b) simde_vcaddq_rot270_f64(a, b) #endif diff --git a/thirdparty/simde/arm/neon/cadd_rot90.h b/thirdparty/simde/arm/neon/cadd_rot90.h index db43d0121..5e3a3cb34 100644 --- a/thirdparty/simde/arm/neon/cadd_rot90.h +++ b/thirdparty/simde/arm/neon/cadd_rot90.h @@ -41,8 +41,8 @@ _Pragma("clang diagnostic ignored \"-Wimplicit-float-conversion\"") SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcadd_rot90_f16(simde_float16x4_t a, simde_float16x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) return vcadd_rot90_f16(a, b); #else @@ -52,7 +52,7 @@ simde_float16x4_t simde_vcadd_rot90_f16(simde_float16x4_t a, simde_float16x4_t b vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); r_.sv64 = __riscv_vfadd_vv_f16m1(op1, a_.sv64, 4); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 1, 4, 3, 6); r_.values = b_.values + a_.values; @@ -68,7 +68,10 @@ 
simde_float16x4_t simde_vcadd_rot90_f16(simde_float16x4_t a, simde_float16x4_t b return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)))) #undef vcadd_rot90_f16 #define vcadd_rot90_f16(a, b) simde_vcadd_rot90_f16(a, b) #endif @@ -76,8 +79,8 @@ simde_float16x4_t simde_vcadd_rot90_f16(simde_float16x4_t a, simde_float16x4_t b SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcaddq_rot90_f16(simde_float16x8_t a, simde_float16x8_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) return vcaddq_rot90_f16(a, b); #else @@ -88,7 +91,7 @@ simde_float16x8_t simde_vcaddq_rot90_f16(simde_float16x8_t a, simde_float16x8_t vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); r_.sv128 = __riscv_vfadd_vv_f16m1(op1, a_.sv128, 8); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) b_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, -b_.values, b_.values, 1, 8, 3, 10, 5, 12, 7, 14); r_.values = b_.values + a_.values; @@ -104,7 +107,10 @@ simde_float16x8_t simde_vcaddq_rot90_f16(simde_float16x8_t a, 
simde_float16x8_t return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)))) #undef vcaddq_rot90_f16 #define vcaddq_rot90_f16(a, b) simde_vcaddq_rot90_f16(a, b) #endif @@ -112,9 +118,9 @@ simde_float16x8_t simde_vcaddq_rot90_f16(simde_float16x8_t a, simde_float16x8_t SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcadd_rot90_f32(simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) return vcadd_rot90_f32(a, b); #else simde_float32x2_private r_, a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); @@ -137,7 +143,10 @@ simde_float32x2_t simde_vcadd_rot90_f32(simde_float32x2_t a, simde_float32x2_t b return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) #undef vcadd_rot90_f32 #define vcadd_rot90_f32(a, b) simde_vcadd_rot90_f32(a, b) #endif @@ -145,9 +154,9 @@ simde_float32x2_t 
simde_vcadd_rot90_f32(simde_float32x2_t a, simde_float32x2_t b SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcaddq_rot90_f32(simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) return vcaddq_rot90_f32(a, b); #else simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); @@ -171,7 +180,10 @@ simde_float32x4_t simde_vcaddq_rot90_f32(simde_float32x4_t a, simde_float32x4_t return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) #undef vcaddq_rot90_f32 #define vcaddq_rot90_f32(a, b) simde_vcaddq_rot90_f32(a, b) #endif @@ -179,9 +191,9 @@ simde_float32x4_t simde_vcaddq_rot90_f32(simde_float32x4_t a, simde_float32x4_t SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vcaddq_rot90_f64(simde_float64x2_t a, simde_float64x2_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || 
SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)) return vcaddq_rot90_f64(a, b); #else simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); @@ -205,9 +217,12 @@ simde_float64x2_t simde_vcaddq_rot90_f64(simde_float64x2_t a, simde_float64x2_t return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) -#undef vcaddq_rot90_f64 -#define vcaddq_rot90_f64(a, b) simde_vcaddq_rot90_f64(a, b) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(15, 0, 0)))) + #undef vcaddq_rot90_f64 + #define vcaddq_rot90_f64(a, b) simde_vcaddq_rot90_f64(a, b) #endif SIMDE_END_DECLS_ diff --git a/thirdparty/simde/arm/neon/cage.h b/thirdparty/simde/arm/neon/cage.h index 5d47b8aa6..0d71025c5 100644 --- a/thirdparty/simde/arm/neon/cage.h +++ b/thirdparty/simde/arm/neon/cage.h @@ -47,7 +47,8 @@ simde_vcageh_f16(simde_float16_t a, simde_float16_t b) { return (simde_math_fabsf(a_) >= simde_math_fabsf(b_)) ? 
UINT16_MAX : UINT16_C(0); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcageh_f16 #define vcageh_f16(a, b) simde_vcageh_f16((a), (b)) #endif @@ -99,7 +100,8 @@ simde_vcage_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcage_f16 #define vcage_f16(a, b) simde_vcage_f16((a), (b)) #endif @@ -150,7 +152,8 @@ simde_vcageq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcageq_f16 #define vcageq_f16(a, b) simde_vcageq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/cagt.h b/thirdparty/simde/arm/neon/cagt.h index 138512f88..bbe9db2c2 100644 --- a/thirdparty/simde/arm/neon/cagt.h +++ b/thirdparty/simde/arm/neon/cagt.h @@ -48,7 +48,8 @@ simde_vcagth_f16(simde_float16_t a, simde_float16_t b) { return (simde_math_fabsf(af) > simde_math_fabsf(bf)) ? 
UINT16_MAX : UINT16_C(0); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcagth_f16 #define vcagth_f16(a, b) simde_vcagth_f16((a), (b)) #endif @@ -99,7 +100,8 @@ simde_vcagt_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcagt_f16 #define vcagt_f16(a, b) simde_vcagt_f16((a), (b)) #endif @@ -150,7 +152,8 @@ simde_vcagtq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcagtq_f16 #define vcagtq_f16(a, b) simde_vcagtq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/cale.h b/thirdparty/simde/arm/neon/cale.h index f2baa5158..acf795c02 100644 --- a/thirdparty/simde/arm/neon/cale.h +++ b/thirdparty/simde/arm/neon/cale.h @@ -42,7 +42,8 @@ simde_vcaleh_f16(simde_float16_t a, simde_float16_t b) { return simde_vcageh_f16(b, a); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcaleh_f16 #define vcaleh_f16(a, b) simde_vcaleh_f16((a), (b)) #endif @@ -84,7 +85,8 @@ simde_vcale_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_vcage_f16(b, a); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) 
#undef vcale_f16 #define vcale_f16(a, b) simde_vcale_f16((a), (b)) #endif @@ -126,7 +128,8 @@ simde_vcaleq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_vcageq_f16(b, a); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcaleq_f16 #define vcaleq_f16(a, b) simde_vcaleq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/calt.h b/thirdparty/simde/arm/neon/calt.h index 99fa38419..1ee960357 100644 --- a/thirdparty/simde/arm/neon/calt.h +++ b/thirdparty/simde/arm/neon/calt.h @@ -42,7 +42,8 @@ simde_vcalth_f16(simde_float16_t a, simde_float16_t b) { return simde_vcagth_f16(b, a); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcalth_f16 #define vcalth_f16(a, b) simde_vcalth_f16((a), (b)) #endif @@ -84,7 +85,8 @@ simde_vcalt_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_vcagt_f16(b, a); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcalt_f16 #define vcalt_f16(a, b) simde_vcalt_f16((a), (b)) #endif @@ -126,7 +128,8 @@ simde_vcaltq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_vcagtq_f16(b, a); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcaltq_f16 #define vcaltq_f16(a, b) simde_vcaltq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/ceq.h b/thirdparty/simde/arm/neon/ceq.h index 03a9c8612..829ef60cb 100644 --- a/thirdparty/simde/arm/neon/ceq.h +++ 
b/thirdparty/simde/arm/neon/ceq.h @@ -43,7 +43,8 @@ simde_vceqh_f16(simde_float16_t a, simde_float16_t b) { return (simde_float16_to_float32(a) == simde_float16_to_float32(b)) ? UINT16_MAX : UINT16_C(0); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceqh_f16 #define vceqh_f16(a, b) simde_vceqh_f16((a), (b)) #endif @@ -122,7 +123,8 @@ simde_vceq_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceq_f16 #define vceq_f16(a, b) simde_vceq_f16((a), (b)) #endif @@ -432,7 +434,8 @@ simde_vceqq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceqq_f16 #define vceqq_f16(a, b) simde_vceqq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/ceqz.h b/thirdparty/simde/arm/neon/ceqz.h index 54f3ce8fb..47d2ecaf7 100644 --- a/thirdparty/simde/arm/neon/ceqz.h +++ b/thirdparty/simde/arm/neon/ceqz.h @@ -47,7 +47,8 @@ simde_vceqz_f16(simde_float16x4_t a) { return simde_vceq_f16(a, simde_vdup_n_f16(SIMDE_FLOAT16_VALUE(0.0))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceqz_f16 #define vceqz_f16(a) simde_vceqz_f16((a)) #endif @@ -201,7 +202,8 @@ simde_vceqzq_f16(simde_float16x8_t a) { return simde_vceqq_f16(a, simde_vdupq_n_f16(SIMDE_FLOAT16_VALUE(0.0))); 
#endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceqzq_f16 #define vceqzq_f16(a) simde_vceqzq_f16((a)) #endif @@ -383,7 +385,8 @@ simde_vceqzh_f16(simde_float16_t a) { return simde_vceqh_f16(a, SIMDE_FLOAT16_VALUE(0.0)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vceqzh_f16 #define vceqzh_f16(a) simde_vceqzh_f16((a)) #endif diff --git a/thirdparty/simde/arm/neon/cge.h b/thirdparty/simde/arm/neon/cge.h index ec2406dfd..5b5d77209 100644 --- a/thirdparty/simde/arm/neon/cge.h +++ b/thirdparty/simde/arm/neon/cge.h @@ -44,7 +44,8 @@ simde_vcgeh_f16(simde_float16_t a, simde_float16_t b){ return (simde_float16_to_float32(a) >= simde_float16_to_float32(b)) ? UINT16_MAX : 0; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgeh_f16 #define vcgeh_f16(a, b) simde_vcgeh_f16((a), (b)) #endif @@ -74,7 +75,8 @@ simde_vcgeq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgeq_f16 #define vcgeq_f16(a, b) simde_vcgeq_f16((a), (b)) #endif @@ -508,7 +510,8 @@ simde_vcge_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + 
!(defined(SIMDE_ARM_NEON_FP16))) #undef vcge_f16 #define vcge_f16(a, b) simde_vcge_f16((a), (b)) #endif @@ -660,10 +663,6 @@ simde_vcge_s32(simde_int32x2_t a, simde_int32x2_t b) { r_.m64 = _mm_or_si64(_mm_cmpgt_pi32(a_.m64, b_.m64), _mm_cmpeq_pi32(a_.m64, b_.m64)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = HEDLEY_REINTERPRET_CAST(__typeof__(r_.values), a_.values >= b_.values); - #elif defined(SIMDE_RISCV_V_NATIVE) - vbool32_t result = __riscv_vmsge_vv_i32m1_b32(a_.sv64, b_.sv64, 2); - r_.sv64 = __riscv_vmv_v_x_i32m1(0, 2); - r_.sv64 = __riscv_vmerge_vxm_i32m1(r_.sv64, -1, result, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/thirdparty/simde/arm/neon/cgez.h b/thirdparty/simde/arm/neon/cgez.h index 04024c48e..5bf373302 100644 --- a/thirdparty/simde/arm/neon/cgez.h +++ b/thirdparty/simde/arm/neon/cgez.h @@ -88,7 +88,8 @@ simde_vcgezh_f16(simde_float16_t a) { return (simde_float16_to_float32(a) >= SIMDE_FLOAT32_C(0.0)) ? 
UINT16_MAX : 0; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgezh_f16 #define vcgezh_f16(a) simde_vcgezh_f16(a) #endif @@ -110,7 +111,8 @@ simde_vcgezq_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgezq_f16 #define vcgezq_f16(a) simde_vcgezq_f16(a) #endif @@ -300,7 +302,8 @@ simde_vcgez_f16(simde_float16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgez_f16 #define vcgez_f16(a) simde_vcgez_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/cgt.h b/thirdparty/simde/arm/neon/cgt.h index f3023cbb1..b48bf70cb 100644 --- a/thirdparty/simde/arm/neon/cgt.h +++ b/thirdparty/simde/arm/neon/cgt.h @@ -92,7 +92,8 @@ simde_vcgth_f16(simde_float16_t a, simde_float16_t b) { return (a_ > b_) ? 
UINT16_MAX : 0; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgth_f16 #define vcgth_f16(a, b) simde_vcgth_f16((a), (b)) #endif @@ -136,7 +137,8 @@ simde_vcgtq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgtq_f16 #define vcgtq_f16(a, b) simde_vcgtq_f16((a), (b)) #endif @@ -556,7 +558,8 @@ simde_vcgt_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgt_f16 #define vcgt_f16(a, b) simde_vcgt_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/cgtz.h b/thirdparty/simde/arm/neon/cgtz.h index 30c6e5dd0..55ed0b7eb 100644 --- a/thirdparty/simde/arm/neon/cgtz.h +++ b/thirdparty/simde/arm/neon/cgtz.h @@ -76,7 +76,8 @@ simde_vcgtzh_f16(simde_float16_t a) { return (simde_float16_to_float32(a) > SIMDE_FLOAT32_C(0.0)) ? 
UINT16_MAX : 0; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgtzh_f16 #define vcgtzh_f16(a) simde_vcgtzh_f16(a) #endif @@ -98,7 +99,8 @@ simde_vcgtzq_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgtzq_f16 #define vcgtzq_f16(a) simde_vcgtzq_f16(a) #endif @@ -302,7 +304,8 @@ simde_vcgtz_f16(simde_float16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcgtz_f16 #define vcgtz_f16(a) simde_vcgtz_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/cle.h b/thirdparty/simde/arm/neon/cle.h index b71f5c936..6ca45d5f4 100644 --- a/thirdparty/simde/arm/neon/cle.h +++ b/thirdparty/simde/arm/neon/cle.h @@ -101,7 +101,8 @@ simde_vcleh_f16(simde_float16_t a, simde_float16_t b) { return (simde_float16_to_float32(a) <= simde_float16_to_float32(b)) ? 
UINT16_MAX : 0; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcleh_f16 #define vcleh_f16(a, b) simde_vcleh_f16((a), (b)) #endif @@ -131,7 +132,8 @@ simde_vcleq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcleq_f16 #define vcleq_f16(a, b) simde_vcleq_f16((a), (b)) #endif @@ -586,7 +588,8 @@ simde_vcle_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcle_f16 #define vcle_f16(a, b) simde_vcle_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/clez.h b/thirdparty/simde/arm/neon/clez.h index b8f1b5f8e..9c30a9f33 100644 --- a/thirdparty/simde/arm/neon/clez.h +++ b/thirdparty/simde/arm/neon/clez.h @@ -91,7 +91,8 @@ simde_vclezh_f16(simde_float16_t a) { return (a_ <= 0.0f) ? 
UINT16_MAX : 0; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vclezh_f16 #define vclezh_f16(a) simde_vclezh_f16(a) #endif @@ -119,7 +120,8 @@ simde_vclezq_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vclezq_f16 #define vclezq_f16(a) simde_vclezq_f16(a) #endif @@ -339,7 +341,8 @@ simde_vclez_f16(simde_float16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vclez_f16 #define vclez_f16(a) simde_vclez_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/clt.h b/thirdparty/simde/arm/neon/clt.h index 8f1281ae0..7a5f28161 100644 --- a/thirdparty/simde/arm/neon/clt.h +++ b/thirdparty/simde/arm/neon/clt.h @@ -91,7 +91,8 @@ simde_vclth_f16(simde_float16_t a, simde_float16_t b) { return (a_ < b_) ? 
UINT16_MAX : 0; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vclth_f16 #define vclth_f16(a, b) simde_vclth_f16((a), (b)) #endif @@ -135,7 +136,8 @@ simde_vcltq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcltq_f16 #define vcltq_f16(a, b) simde_vcltq_f16((a), (b)) #endif @@ -564,7 +566,8 @@ simde_vclt_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vclt_f16 #define vclt_f16(a, b) simde_vclt_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/cltz.h b/thirdparty/simde/arm/neon/cltz.h index 2c61d1a16..a4d7f54f9 100644 --- a/thirdparty/simde/arm/neon/cltz.h +++ b/thirdparty/simde/arm/neon/cltz.h @@ -91,7 +91,8 @@ simde_vcltzh_f16(simde_float16_t a) { return (simde_float16_to_float32(a) < SIMDE_FLOAT32_C(0.0)) ? 
UINT16_MAX : 0; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcltzh_f16 #define vcltzh_f16(a) simde_vcltzh_f16(a) #endif @@ -113,7 +114,8 @@ simde_vcltz_f16(simde_float16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcltz_f16 #define vcltz_f16(a) simde_vcltz_f16(a) #endif @@ -255,7 +257,8 @@ simde_vcltzq_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcltzq_f16 #define vcltzq_f16(a) simde_vcltzq_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/cmla.h b/thirdparty/simde/arm/neon/cmla.h index 68b9a0065..558475533 100644 --- a/thirdparty/simde/arm/neon/cmla.h +++ b/thirdparty/simde/arm/neon/cmla.h @@ -37,7 +37,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) @@ -63,7 +63,10 @@ simde_vcmla_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmla_f16 #define vcmla_f16(r, a, b) simde_vcmla_f16(r, a, b) #endif @@ -71,7 +74,7 @@ simde_vcmla_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARCH_ARM_COMPLEX) @@ -95,7 +98,10 @@ simde_vcmla_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmla_f32 #define vcmla_f32(r, a, b) simde_vcmla_f32(r, a, b) #endif @@ -103,7 +109,7 @@ simde_vcmla_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcmlaq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) @@ -129,7 
+135,10 @@ simde_vcmlaq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_f16 #define vcmlaq_f16(r, a, b) simde_vcmlaq_f16(r, a, b) #endif @@ -137,7 +146,7 @@ simde_vcmlaq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARCH_ARM_COMPLEX) @@ -163,7 +172,10 @@ simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_f32 #define vcmlaq_f32(r, a, b) simde_vcmlaq_f32(r, a, b) #endif @@ -171,7 +183,7 @@ simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vcmlaq_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + 
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARCH_ARM_COMPLEX) @@ -197,7 +209,10 @@ simde_vcmlaq_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_f64 #define vcmlaq_f64(r, a, b) simde_vcmlaq_f64(r, a, b) #endif diff --git a/thirdparty/simde/arm/neon/cmla_lane.h b/thirdparty/simde/arm/neon/cmla_lane.h index 9a9f885d3..0415641e3 100644 --- a/thirdparty/simde/arm/neon/cmla_lane.h +++ b/thirdparty/simde/arm/neon/cmla_lane.h @@ -35,6 +35,8 @@ #include "get_low.h" #include "mul.h" #include "types.h" +#include "cmla.h" +#include "get_lane.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ @@ -44,42 +46,41 @@ simde_float16x4_t simde_vcmla_lane_f16(simde_float16x4_t r, simde_float16x4_t a, SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - simde_float16x4_private r_ = simde_float16x4_to_private(r), - a_ = simde_float16x4_to_private(a), - b_ = simde_float16x4_to_private(b); - uint16_t idx1[4] = {0, 0, 2, 2}; - vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ - a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); - r_.sv64 = __riscv_vfmacc_vf_f16m1(r_.sv64, b_.values[lane], op1, 4); - return simde_float16x4_from_private(r_); - #else + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + 
simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {0, 0, 2, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, b_.sv64, op1, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), - a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } - #endif + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_lane_f16 - #define vcmla_lane_f16(r, a, b, 
lane) simde_vcmla_lane_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_lane_f16(r, a, b, lane) vcmla_lane_f16(r, a, b, lane) +#else + #define simde_vcmla_lane_f16(r, a, b, lane) simde_vcmla_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_lane_f16 + #define vcmla_lane_f16(r, a, b, lane) simde_vcmla_lane_f16(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -89,212 +90,208 @@ simde_float32x2_t simde_vcmla_lane_f32(simde_float32x2_t r, simde_float32x2_t a, #if defined(SIMDE_RISCV_V_NATIVE) simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), - b_ = simde_float32x2_to_private(b); + b_ = simde_float32x2_to_private(simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); uint32_t idx1[2] = {0, 0}; vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); - r_.sv64 = __riscv_vfmacc_vf_f32m1(r_.sv64, b_.values[lane], op1, 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, b_.sv64, op1, 2); return simde_float32x2_from_private(r_); #else - simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = 
simde_float32x2_to_private(a), - b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane])); #if defined(SIMDE_SHUFFLE_VECTOR_) + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } + return simde_vcmla_f32(r, a, b_tmp); #endif - return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_lane_f32 - #define vcmla_lane_f32(r, a, b, lane) simde_vcmla_lane_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_lane_f32(r, a, b, lane) vcmla_lane_f32(r, a, b, lane) +#else + #define simde_vcmla_lane_f32(r, a, b, lane) simde_vcmla_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) 
+ #undef vcmla_lane_f32 + #define vcmla_lane_f32(r, a, b, lane) simde_vcmla_lane_f32(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x4_t simde_vcmla_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) +simde_float16x8_t simde_vcmlaq_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - simde_float16x4_private r_ = simde_float16x4_to_private(r), - a_ = simde_float16x4_to_private(a); - simde_float16x8_private b_ = simde_float16x8_to_private(b); - uint16_t idx1[4] = {0, 0, 2, 2}; - vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ - a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); - r_.sv64 = __riscv_vfmacc_vf_f16m1(r_.sv64, b_.values[lane], op1, 4); - return simde_float16x4_from_private(r_); - #else - simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), - a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); - r_.values += b_.values * a_.values; + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2( \ + __riscv_vslideup_vx_f16m2(a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + 
r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, b_.sv128, op1, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } - #endif - return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + return simde_vcmlaq_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_laneq_f16 - #define vcmla_laneq_f16(r, a, b, lane) simde_vcmla_laneq_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) - #define simde_vcmla_laneq_f16(r, a, b, lane) vcmla_laneq_f16(r, a, b, lane) + #define simde_vcmlaq_lane_f16(r, a, b, lane) vcmlaq_lane_f16(r, a, b, lane) +#else + #define simde_vcmlaq0_lane_f16(r, a, b, lane) simde_vcmlaq_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_lane_f16 + #define vcmlaq_lane_f16(r, a, b, lane) simde_vcmlaq_lane_f16(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x2_t simde_vcmla_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) - SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +simde_float32x4_t simde_vcmlaq_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { #if defined(SIMDE_RISCV_V_NATIVE) - simde_float32x2_private r_ = simde_float32x2_to_private(r), - a_ = simde_float32x2_to_private(a); - simde_float32x4_private b_ = simde_float32x4_to_private(b); - uint32_t idx1[2] = {0, 0}; - vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); - r_.sv64 = __riscv_vfmacc_vf_f32m1(r_.sv64, b_.values[lane], op1, 2); - return simde_float32x2_from_private(r_); + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = 
simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {0, 0, 2, 2}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2( \ + __riscv_vslideup_vx_f32m2(a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, b_.sv128, op1, 4); + return simde_float32x4_from_private(r_); #else - simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), - b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); + simde_float32x4_t b_tmp = simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane])); #if defined(SIMDE_SHUFFLE_VECTOR_) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } + return simde_vcmlaq_f32(r, a, b_tmp); #endif - return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_laneq_f32 - #define vcmla_laneq_f32(r, a, b, lane) simde_vcmla_laneq_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) - #define simde_vcmla_laneq_f32(r, a, b, lane) vcmla_laneq_f32(r, a, b, lane) + #define simde_vcmlaq_lane_f32(r, a, b, lane) vcmlaq_lane_f32(r, a, b, 0) +#else + #define simde_vcmlaq_lane_f32(r, a, b, lane) simde_vcmlaq_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_lane_f32 + #define vcmlaq_lane_f32(r, a, b, lane) simde_vcmlaq_lane_f32(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x8_t simde_vcmlaq_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) +simde_float16x4_t simde_vcmla_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - simde_float16x8_private r_ = simde_float16x8_to_private(r), - a_ = simde_float16x8_to_private(a); - simde_float16x4_private b_ = simde_float16x4_to_private(b); - uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; - vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); - vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2( \ - __riscv_vslideup_vx_f16m2(a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); - r_.sv128 = __riscv_vfmacc_vf_f16m1(r_.sv128, b_.values[lane], op1, 8); - return simde_float16x8_from_private(r_); + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + 
simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {0, 0, 2, 2}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, b_.sv64, op1, 4); + return simde_float16x4_from_private(r_); #else - simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), - a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), - r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), - a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); - a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += b_.values[lane] * a_low.values[2 * 
i]; - r_low.values[2 * i + 1] += b_.values[lane] * a_low.values[2 * i]; - r_high.values[2 * i] += b_.values[lane] * a_high.values[2 * i]; - r_high.values[2 * i + 1] += b_.values[lane] * a_high.values[2 * i]; - } + return simde_vcmla_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); #endif - return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), - simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_lane_f16 - #define vcmlaq_lane_f16(r, a, b, lane) simde_vcmlaq_lane_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) - #define simde_vcmlaq_lane_f16(r, a, b, lane) vcmlaq_lane_f16(r, a, b, lane) + #define simde_vcmla_laneq_f16(r, a, b, lane) vcmla_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_laneq_f16 + #define vcmla_laneq_f16(r, a, b, lane) simde_vcmla_laneq_f16(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x4_t simde_vcmlaq_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) - SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +simde_float32x2_t simde_vcmla_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int 
lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) - simde_float32x4_private r_ = simde_float32x4_to_private(r), - a_ = simde_float32x4_to_private(a); - simde_float32x2_private b_ = simde_float32x2_to_private(b); - uint32_t idx1[4] = {0, 0, 2, 2}; - vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); - vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2( \ - __riscv_vslideup_vx_f32m2(a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); - r_.sv128 = __riscv_vfmacc_vf_f32m1(r_.sv128, b_.values[lane], op1, 4); - return simde_float32x4_from_private(r_); + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {0, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, b_.sv64, op1, 2); + return simde_float32x2_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); #else - simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), - b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); - #if defined(SIMDE_SHUFFLE_VECTOR_) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - 
r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } - #endif - return simde_float32x4_from_private(r_); + return simde_vcmla_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_lane_f32 - #define vcmlaq_lane_f32(r, a, b, lane) simde_vcmlaq_lane_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) - #define simde_vcmlaq_lane_f32(r, a, b, lane) vcmlaq_lane_f32(r, a, b, 0); + #define simde_vcmla_laneq_f32(r, a, b, lane) vcmla_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_laneq_f32 + #define vcmla_laneq_f32(r, a, b, lane) simde_vcmla_laneq_f32(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -302,93 +299,86 @@ simde_float16x8_t simde_vcmlaq_laneq_f16(simde_float16x8_t r, simde_float16x8_t SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - simde_float16x8_private r_ = simde_float16x8_to_private(r), - a_ = simde_float16x8_to_private(a); - simde_float16x8_private b_ = simde_float16x8_to_private(b); - uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; - vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); - vfloat16m1_t op1 = 
__riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2( \ - __riscv_vslideup_vx_f16m2(a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); - r_.sv128 = __riscv_vfmacc_vf_f16m1(r_.sv128, b_.values[lane], op1, 8); - return simde_float16x8_from_private(r_); - #else + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2( \ + __riscv_vslideup_vx_f16m2(a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, b_.sv128, op1, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); - r_low.values += b_.values * a_low.values; - a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); - r_high.values += b_.values * a_high.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < 
(sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += b_.values[lane] * a_low.values[2 * i]; - r_low.values[2 * i + 1] += b_.values[lane] * a_low.values[2 * i]; - r_high.values[2 * i] += b_.values[lane] * a_high.values[2 * i]; - r_high.values[2 * i + 1] += b_.values[lane] * a_high.values[2 * i]; - } - #endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + r_low.values += b_.values * a_low.values; + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); + r_high.values += b_.values * a_high.values; return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_laneq_f16 - #define vcmlaq_laneq_f16(r, a, b, lane) simde_vcmlaq_laneq_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_laneq_f16(r, a, b, lane) vcmlaq_laneq_f16(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || 
HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_laneq_f16 + #define vcmlaq_laneq_f16(r, a, b, lane) simde_vcmlaq_laneq_f16(r, a, b, lane) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) - simde_float32x4_private r_ = simde_float32x4_to_private(r), - a_ = simde_float32x4_to_private(a); - simde_float32x4_private b_ = simde_float32x4_to_private(b); - uint32_t idx1[4] = {0, 0, 2, 2}; - vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); - vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2( \ - __riscv_vslideup_vx_f32m2(a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); - r_.sv128 = __riscv_vfmacc_vf_f32m1(r_.sv128, b_.values[lane], op1, 4); - return simde_float32x4_from_private(r_); + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {0, 0, 2, 2}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2( \ + __riscv_vslideup_vx_f32m2(a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, b_.sv128, op1, 4); + return simde_float32x4_from_private(r_); #else - simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), - b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); - + simde_float32x4_t b_tmp = simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane])); #if 
defined(SIMDE_SHUFFLE_VECTOR_) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b_tmp); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; - r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; - } + return simde_vcmlaq_f32(r, a, b_tmp); #endif - return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_laneq_f32 - #define vcmlaq_laneq_f32(r, a, b, lane) simde_vcmlaq_laneq_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_laneq_f32(r, a, b, lane) vcmlaq_laneq_f32(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_laneq_f32 + #define vcmlaq_laneq_f32(r, a, b, lane) simde_vcmlaq_laneq_f32(r, a, b, lane) +#endif SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/arm/neon/cmla_rot180.h b/thirdparty/simde/arm/neon/cmla_rot180.h index 44cf28312..bfa7ee840 100644 --- a/thirdparty/simde/arm/neon/cmla_rot180.h +++ b/thirdparty/simde/arm/neon/cmla_rot180.h @@ -37,7 +37,7 @@ SIMDE_BEGIN_DECLS_ 
SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_rot180_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) @@ -63,18 +63,56 @@ simde_vcmla_rot180_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4 return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmla_rot180_f16 #define vcmla_rot180_f16(r, a, b) simde_vcmla_rot180_f16(r, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcmla_rot180_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcmla_rot180_f32(r, a, b); + #else + simde_float32x2_private + r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + + #if defined(SIMDE_SHUFFLE_VECTOR_) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] += 
-(b_.values[2 * i]) * a_.values[2 * i]; + r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; + } + #endif + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot180_f32 + #define vcmla_rot180_f32(r, a, b) simde_vcmla_rot180_f32(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vcmlaq_rot180_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ - defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) return vcmlaq_rot180_f16(r, a, b); #else simde_float16x8_private @@ -97,50 +135,18 @@ simde_vcmlaq_rot180_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) #undef vcmlaq_rot180_f16 #define vcmlaq_rot180_f16(r, a, b) simde_vcmlaq_rot180_f16(r, a, b) #endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float32x2_t -simde_vcmla_rot180_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t 
b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ - defined(SIMDE_ARCH_ARM_COMPLEX) - return vcmla_rot180_f32(r, a, b); - #else - simde_float32x2_private - r_ = simde_float32x2_to_private(r), - a_ = simde_float32x2_to_private(a), - b_ = simde_float32x2_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { - r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; - r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; - } - #endif - - return simde_float32x2_from_private(r_); - #endif -} -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot180_f32 - #define vcmla_rot180_f32(r, a, b) simde_vcmla_rot180_f32(r, a, b) -#endif - SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARCH_ARM_COMPLEX) @@ -170,7 +176,10 @@ simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ 
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) #undef vcmlaq_rot180_f32 #define vcmlaq_rot180_f32(r, a, b) simde_vcmlaq_rot180_f32(r, a, b) #endif @@ -178,7 +187,7 @@ simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vcmlaq_rot180_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARCH_ARM_COMPLEX) @@ -208,7 +217,10 @@ simde_vcmlaq_rot180_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_rot180_f64 #define vcmlaq_rot180_f64(r, a, b) simde_vcmlaq_rot180_f64(r, a, b) #endif diff --git a/thirdparty/simde/arm/neon/cmla_rot180_lane.h b/thirdparty/simde/arm/neon/cmla_rot180_lane.h index 8800fbc26..19f6b7fbb 100644 --- a/thirdparty/simde/arm/neon/cmla_rot180_lane.h +++ b/thirdparty/simde/arm/neon/cmla_rot180_lane.h @@ -35,103 +35,114 @@ #include "get_low.h" #include "mul.h" #include "types.h" +#include "cmla_rot180.h" +#include "get_lane.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES -simde_float16x4_t simde_vcmla_rot180_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) +simde_float16x4_t +simde_vcmla_rot180_lane_f16(simde_float16x4_t r, simde_float16x4_t a, 
simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - simde_float16x4_private r_ = simde_float16x4_to_private(r), - a_ = simde_float16x4_to_private(a), - b_ = simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]); - uint16_t idx1[4] = {0, 0, 2, 2}; - uint16_t idx2[4] = {0, 1, 2, 3}; - vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ - a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {0, 0, 2, 2}; + uint16_t idx2[4] = {0, 1, 2, 3}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ - __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); - r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); - return simde_float16x4_from_private(r_); - #else + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - 
a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; - r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; - } - #endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot180_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot180_lane_f16 - #define vcmla_rot180_lane_f16(r, a, b, lane) simde_vcmla_rot180_lane_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_rot180_lane_f16(r, a, b, lane) vcmla_rot180_lane_f16(r, a, b, lane) +#else + #define simde_vcmla_rot180_lane_f16(r, a, b, lane) simde_vcmla_rot180_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot180_lane_f16 + #define vcmla_rot180_lane_f16(r, a, b, lane) simde_vcmla_rot180_lane_f16(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x2_t simde_vcmla_rot180_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) +simde_float32x2_t +simde_vcmla_rot180_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { - simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), - b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) - uint32_t idx1[2] = {0, 0}; - uint32_t idx2[2] = {0, 1}; - vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); - vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); - r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {0, 0}; + uint32_t idx2[2] = {0, 1}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 
2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1); r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; - r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; - } + return simde_vcmla_rot180_f32(r, a, b_tmp); #endif - return simde_float32x2_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot180_lane_f32 - #define vcmla_rot180_lane_f32(r, a, b, lane) simde_vcmla_rot180_lane_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_rot180_lane_f32(r, a, b, lane) vcmla_rot180_lane_f32(r, a, b, lane) +#else + #define simde_vcmla_rot180_lane_f32(r, a, b, lane) simde_vcmla_rot180_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || 
SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot180_lane_f32 + #define vcmla_rot180_lane_f32(r, a, b, lane) simde_vcmla_rot180_lane_f32(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x8_t simde_vcmlaq_rot180_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) +simde_float16x8_t +simde_vcmlaq_rot180_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH simde_float16x8_private r_ = simde_float16x8_to_private(r), a_ = simde_float16x8_to_private(a), - b_ = simde_vdupq_n_f16(simde_float16x4_to_private(b).values[lane]); + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); uint16_t idx1[8] = {0, 0, 2, 2, 4, 4, 6, 6}; uint16_t idx2[8] = {0, 1, 2, 3, 4, 5, 6, 7}; vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); @@ -142,92 +153,95 @@ simde_float16x8_t simde_vcmlaq_rot180_lane_f16(simde_float16x8_t r, simde_float1 __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); return simde_float16x8_from_private(r_); - #else + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if 
defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); - a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += -(b_.values[2 * i]) * a_low.values[2 * i]; - r_low.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_low.values[2 * i]; - r_high.values[2 * i] += -(b_.values[2 * i]) * a_high.values[2 * i]; - r_high.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_high.values[2 * i]; - } - #endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot180_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot180_lane_f16 - #define vcmlaq_rot180_lane_f16(r, a, b, lane) simde_vcmlaq_rot180_lane_f16(r, a, b, lane) -#endif -#if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_rot180_lane_f16(r, a, b, lane) vcmlaq_rot180_lane_f16(r, a, b, lane) +#else + #define simde_vcmlaq_rot180_lane_f16(r, a, b, lane) simde_vcmlaq_rot180_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot180_lane_f16 + #define vcmlaq_rot180_lane_f16(r, a, b, lane) simde_vcmlaq_rot180_lane_f16(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x4_t simde_vcmlaq_rot180_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) +simde_float32x4_t +simde_vcmlaq_rot180_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { - simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), - b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) - uint32_t idx1[4] = {0, 0, 2, 2}; - uint32_t idx2[4] = {0, 1, 2, 3}; - vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); - vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); - vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ 
- a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); - vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ - __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); - r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {0, 0, 2, 2}; + uint32_t idx2[4] = {0, 1, 2, 3}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; - r_.values[2 * i + 1] += 
-(b_.values[2 * i + 1]) * a_.values[2 * i]; - } + return simde_vcmlaq_rot180_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); #endif - return simde_float32x4_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot180_lane_f32 - #define vcmlaq_rot180_lane_f32(r, a, b, lane) simde_vcmlaq_rot180_lane_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_rot180_lane_f32(r, a, b, lane) vcmlaq_rot180_lane_f32(r, a, b, lane) +#else + #define simde_vcmlaq_rot180_lane_f32(r, a, b, lane) simde_vcmlaq_rot180_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot180_lane_f32 + #define vcmlaq_rot180_lane_f32(r, a, b, lane) simde_vcmlaq_rot180_lane_f32(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x4_t simde_vcmla_rot180_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) +simde_float16x4_t +simde_vcmla_rot180_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH simde_float16x4_private r_ = simde_float16x4_to_private(r), a_ = simde_float16x4_to_private(a), 
- b_ = simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]); + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); uint16_t idx1[4] = {0, 0, 2, 2}; uint16_t idx2[4] = {0, 1, 2, 3}; vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ @@ -236,83 +250,86 @@ simde_float16x4_t simde_vcmla_rot180_laneq_f16(simde_float16x4_t r, simde_float1 __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); return simde_float16x4_from_private(r_); - #else + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; - r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; - } - #endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + b_.values = 
SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot180_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot180_laneq_f16 - #define vcmla_rot180_laneq_f16(r, a, b, lane) simde_vcmla_rot180_laneq_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_rot180_laneq_f16(r, a, b, lane) vcmla_rot180_laneq_f16(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot180_laneq_f16 + #define vcmla_rot180_laneq_f16(r, a, b, lane) simde_vcmla_rot180_laneq_f16(r, a, b, lane) +#endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x2_t simde_vcmla_rot180_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) +simde_float32x2_t +simde_vcmla_rot180_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), - b_ = 
simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) - uint32_t idx1[2] = {0, 0}; - uint32_t idx2[2] = {0, 1}; - vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); - vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); - r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {0, 0}; + uint32_t idx2[2] = {0, 1}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1); r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; - r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; - } + return 
simde_vcmla_rot180_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); #endif - return simde_float32x2_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot180_laneq_f32 - #define vcmla_rot180_laneq_f32(r, a, b, lane) simde_vcmla_rot180_laneq_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_rot180_laneq_f32(r, a, b, lane) vcmla_rot180_laneq_f32(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot180_laneq_f32 + #define vcmla_rot180_laneq_f32(r, a, b, lane) simde_vcmla_rot180_laneq_f32(r, a, b, lane) +#endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x8_t simde_vcmlaq_rot180_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, +simde_float16x8_t +simde_vcmlaq_rot180_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH simde_float16x8_private r_ = simde_float16x8_to_private(r), a_ = simde_float16x8_to_private(a), - b_ = simde_vdupq_n_f16(simde_float16x8_to_private(b).values[lane]); + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); uint16_t idx1[8] = {0, 
0, 2, 2, 4, 4, 6, 6}; uint16_t idx2[8] = {0, 1, 2, 3, 4, 5, 6, 7}; vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); @@ -323,83 +340,82 @@ simde_float16x8_t simde_vcmlaq_rot180_laneq_f16(simde_float16x8_t r, simde_float __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); return simde_float16x8_from_private(r_); - #else + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); - a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += -(b_.values[2 * i]) * a_low.values[2 * i]; - r_low.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_low.values[2 * i]; - r_high.values[2 * i] += -(b_.values[2 * i]) * a_high.values[2 * i]; - r_high.values[2 * i + 1] += -(b_.values[2 * i + 1]) * 
a_high.values[2 * i]; - } - #endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot180_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot180_laneq_f16 - #define vcmlaq_rot180_laneq_f16(r, a, b, lane) simde_vcmlaq_rot180_laneq_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_rot180_laneq_f16(r, a, b, lane) vcmlaq_rot180_laneq_f16(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot180_laneq_f16 + #define vcmlaq_rot180_laneq_f16(r, a, b, lane) 
simde_vcmlaq_rot180_laneq_f16(r, a, b, lane) +#endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x4_t simde_vcmlaq_rot180_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, - const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +simde_float32x4_t +simde_vcmlaq_rot180_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), - b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) - uint32_t idx1[4] = {0, 0, 2, 2}; - uint32_t idx2[4] = {0, 1, 2, 3}; - vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); - vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); - vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ - a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); - vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ - __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); - r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {0, 0, 2, 2}; + uint32_t idx2[4] = {0, 1, 2, 3}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = 
__riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; - r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; - } + return simde_vcmlaq_rot180_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); #endif - return simde_float32x4_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot180_laneq_f32 - #define vcmlaq_rot180_laneq_f32(r, a, b, lane) simde_vcmlaq_rot180_laneq_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_rot180_laneq_f32(r, a, b, lane) vcmlaq_rot180_laneq_f32(r, a, b, lane) #endif +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot180_laneq_f32 + #define vcmlaq_rot180_laneq_f32(r, a, b, lane) simde_vcmlaq_rot180_laneq_f32(r, a, b, lane) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/arm/neon/cmla_rot270.h b/thirdparty/simde/arm/neon/cmla_rot270.h index 530a30ae9..363f0b2ff 100644 --- a/thirdparty/simde/arm/neon/cmla_rot270.h +++ b/thirdparty/simde/arm/neon/cmla_rot270.h @@ -37,10 +37,9 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_rot270_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ - defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) return vcmla_rot270_f16(r, a, b); #else simde_float16x4_private @@ -63,52 +62,20 @@ simde_vcmla_rot270_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4 return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) #undef vcmla_rot270_f16 #define vcmla_rot270_f16(r, a, b) simde_vcmla_rot270_f16(r, a, b) 
#endif -SIMDE_FUNCTION_ATTRIBUTES -simde_float16x8_t -simde_vcmlaq_rot270_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ - defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) - return vcmlaq_rot270_f16(r, a, b); - #else - simde_float16x8_private - r_ = simde_float16x8_to_private(r), - a_ = simde_float16x8_to_private(a), - b_ = simde_float16x8_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { - r_.values[2 * i] = simde_float16_from_float32( - simde_float16_to_float32(r_.values[2 * i]) + - simde_float16_to_float32(b_.values[2 * i + 1]) * - simde_float16_to_float32(a_.values[2 * i + 1])); - r_.values[2 * i + 1] = simde_float16_from_float32( - simde_float16_to_float32(r_.values[2 * i + 1]) - - simde_float16_to_float32(b_.values[2 * i]) * - simde_float16_to_float32(a_.values[2 * i + 1])); - } - - return simde_float16x8_from_private(r_); - #endif -} -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot270_f16 - #define vcmlaq_rot270_f16(r, a, b) simde_vcmlaq_rot270_f16(r, a, b) -#endif - SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_rot270_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ - defined(SIMDE_ARCH_ARM_COMPLEX) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) return vcmla_rot270_f32(r, a, b); #else simde_float32x2_private @@ -131,18 +98,56 @@ 
simde_vcmla_rot270_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2 return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) #undef vcmla_rot270_f32 #define vcmla_rot270_f32(r, a, b) simde_vcmla_rot270_f32(r, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot270_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + return vcmlaq_rot270_f16(r, a, b); + #else + simde_float16x8_private + r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) + + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) - + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || 
SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot270_f16 + #define vcmlaq_rot270_f16(r, a, b) simde_vcmlaq_rot270_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ - defined(SIMDE_ARCH_ARM_COMPLEX) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) return vcmlaq_rot270_f32(r, a, b); #else simde_float32x4_private @@ -169,7 +174,10 @@ simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) #undef vcmlaq_rot270_f32 #define vcmlaq_rot270_f32(r, a, b) simde_vcmlaq_rot270_f32(r, a, b) #endif @@ -177,7 +185,7 @@ simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vcmlaq_rot270_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARCH_ARM_COMPLEX) @@ -207,7 +215,10 @@ simde_vcmlaq_rot270_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x return 
simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_rot270_f64 #define vcmlaq_rot270_f64(r, a, b) simde_vcmlaq_rot270_f64(r, a, b) #endif diff --git a/thirdparty/simde/arm/neon/cmla_rot270_lane.h b/thirdparty/simde/arm/neon/cmla_rot270_lane.h index c03f5c147..c11f00e4f 100644 --- a/thirdparty/simde/arm/neon/cmla_rot270_lane.h +++ b/thirdparty/simde/arm/neon/cmla_rot270_lane.h @@ -35,18 +35,22 @@ #include "get_low.h" #include "mul.h" #include "types.h" +#include "cmla_rot270.h" +#include "get_lane.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES -simde_float16x4_t simde_vcmla_rot270_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) +simde_float16x4_t +simde_vcmla_rot270_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH simde_float16x4_private r_ = simde_float16x4_to_private(r), a_ = simde_float16x4_to_private(a), - b_ = simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]); + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); uint16_t idx1[4] = {1, 1, 3, 3}; uint16_t idx2[4] = {5, 0, 7, 2}; vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ @@ -55,83 +59,89 @@ simde_float16x4_t simde_vcmla_rot270_lane_f16(simde_float16x4_t r, simde_float16 __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); r_.sv64 = 
__riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); return simde_float16x4_from_private(r_); - #else + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; - } - #endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot270_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot270_lane_f16 - #define vcmla_rot270_lane_f16(r, a, b, lane) simde_vcmla_rot270_lane_f16(r, a, b, lane) -#endif -#if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_rot270_lane_f16(r, a, b, lane) vcmla_rot270_lane_f16(r, a, b, lane) +#else + #define simde_vcmla_rot270_lane_f16(r, a, b, lane) simde_vcmla_rot270_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot270_lane_f16 + #define vcmla_rot270_lane_f16(r, a, b, lane) simde_vcmla_rot270_lane_f16(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x2_t simde_vcmla_rot270_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) +simde_float32x2_t +simde_vcmla_rot270_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { - simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), - b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) - uint32_t idx1[2] = {1, 1}; - uint32_t idx2[2] = {3, 0}; - vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - a_.sv64, a_.sv64, 2, 4), 
__riscv_vle32_v_u32m1(idx1, 2), 2); - vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); - r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {3, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; - } + return simde_vcmla_rot270_f32(r, a, b_tmp); #endif - return simde_float32x2_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot270_lane_f32 - #define vcmla_rot270_lane_f32(r, a, b, lane) simde_vcmla_rot270_lane_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_rot270_lane_f32(r, a, b, lane) vcmla_rot270_lane_f32(r, a, b, lane) +#else + #define simde_vcmla_rot270_lane_f32(r, a, b, lane) simde_vcmla_rot270_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot270_lane_f32 + #define vcmla_rot270_lane_f32(r, a, b, lane) simde_vcmla_rot270_lane_f32(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x8_t simde_vcmlaq_rot270_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) +simde_float16x8_t +simde_vcmlaq_rot270_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH simde_float16x8_private r_ = simde_float16x8_to_private(r), a_ = simde_float16x8_to_private(a), - b_ = simde_vdupq_n_f16(simde_float16x4_to_private(b).values[lane]); + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; uint16_t idx2[8] = {9, 0, 11, 2, 13, 4, 15, 6}; vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); @@ -142,51 +152,49 @@ simde_float16x8_t simde_vcmlaq_rot270_lane_f16(simde_float16x8_t r, simde_float1 __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); r_.sv128 = 
__riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); return simde_float16x8_from_private(r_); - #else + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); - a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += b_.values[2 * i + 1] * a_low.values[2 * i + 1]; - r_low.values[2 * i + 1] += -(b_.values[2 * i]) * a_low.values[2 * i + 1]; - r_high.values[2 * i] += b_.values[2 * i + 1] * a_high.values[2 * i + 1]; - r_high.values[2 * i + 1] += -(b_.values[2 * i]) * a_high.values[2 * i + 1]; - } - #endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = 
SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot270_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot270_lane_f16 - #define vcmlaq_rot270_lane_f16(r, a, b, lane) simde_vcmlaq_rot270_lane_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_rot270_lane_f16(r, a, b, lane) vcmlaq_rot270_lane_f16(r, a, b, lane) +#else + #define simde_vcmlaq_rot270_lane_f16(r, a, b, lane) simde_vcmlaq_rot270_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot270_lane_f16 + #define vcmlaq_rot270_lane_f16(r, a, b, lane) simde_vcmlaq_rot270_lane_f16(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x4_t simde_vcmlaq_rot270_lane_f32(simde_float32x4_t r, simde_float32x4_t 
a, simde_float32x2_t b, const int lane) +simde_float32x4_t +simde_vcmlaq_rot270_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { - simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), - b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); uint32_t idx1[4] = {1, 1, 3, 3}; uint32_t idx2[4] = {5, 0, 7, 2}; vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); @@ -196,38 +204,43 @@ simde_float32x4_t simde_vcmlaq_rot270_lane_f32(simde_float32x4_t r, simde_float3 vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += -(b_.values[2 * 
i]) * a_.values[2 * i + 1]; - } + return simde_vcmlaq_rot270_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); #endif - return simde_float32x4_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot270_lane_f32 - #define vcmlaq_rot270_lane_f32(r, a, b, lane) simde_vcmlaq_rot270_lane_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_rot270_lane_f32(r, a, b, lane) vcmlaq_rot270_lane_f32(r, a, b, lane) +#else + #define simde_vcmlaq_rot270_lane_f32(r, a, b, lane) simde_vcmlaq_rot270_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot270_lane_f32 + #define vcmlaq_rot270_lane_f32(r, a, b, lane) simde_vcmlaq_rot270_lane_f32(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x4_t simde_vcmla_rot270_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) +simde_float16x4_t +simde_vcmla_rot270_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH simde_float16x4_private r_ = simde_float16x4_to_private(r), a_ = simde_float16x4_to_private(a), - b_ = 
simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]); + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); uint16_t idx1[4] = {1, 1, 3, 3}; uint16_t idx2[4] = {5, 0, 7, 2}; vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ @@ -236,83 +249,86 @@ simde_float16x4_t simde_vcmla_rot270_laneq_f16(simde_float16x4_t r, simde_float1 __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); return simde_float16x4_from_private(r_); - #else + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; - } - #endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = 
SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot270_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot270_laneq_f16 - #define vcmla_rot270_laneq_f16(r, a, b, lane) simde_vcmla_rot270_laneq_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_rot270_laneq_f16(r, a, b, lane) vcmla_rot270_laneq_f16(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot270_laneq_f16 + #define vcmla_rot270_laneq_f16(r, a, b, lane) simde_vcmla_rot270_laneq_f16(r, a, b, lane) +#endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x2_t simde_vcmla_rot270_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) +simde_float32x2_t +simde_vcmla_rot270_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), - b_ = 
simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) - uint32_t idx1[2] = {1, 1}; - uint32_t idx2[2] = {3, 0}; - vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); - vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); - r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {3, 0}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; - } + return 
simde_vcmla_rot270_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); #endif - return simde_float32x2_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot270_laneq_f32 - #define vcmla_rot270_laneq_f32(r, a, b, lane) simde_vcmla_rot270_laneq_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_rot270_laneq_f32(r, a, b, lane) vcmla_rot270_laneq_f32(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot270_laneq_f32 + #define vcmla_rot270_laneq_f32(r, a, b, lane) simde_vcmla_rot270_laneq_f32(r, a, b, lane) +#endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x8_t simde_vcmlaq_rot270_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, +simde_float16x8_t +simde_vcmlaq_rot270_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH simde_float16x8_private r_ = simde_float16x8_to_private(r), a_ = simde_float16x8_to_private(a), - b_ = simde_vdupq_n_f16(simde_float16x8_to_private(b).values[lane]); + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); uint16_t idx1[8] = {1, 
1, 3, 3, 5, 5, 7, 7}; uint16_t idx2[8] = {9, 0, 11, 2, 13, 4, 15, 6}; vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); @@ -323,51 +339,48 @@ simde_float16x8_t simde_vcmlaq_rot270_laneq_f16(simde_float16x8_t r, simde_float __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); return simde_float16x8_from_private(r_); - #else + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); - a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); - r_high.values += b_.values * a_high.values; - r_low.values += b_.values * a_low.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += b_.values[2 * i + 1] * a_low.values[2 * i + 1]; - r_low.values[2 * i + 1] += -(b_.values[2 * i]) * a_low.values[2 * i + 1]; - r_high.values[2 * i] += b_.values[2 * i + 1] * a_high.values[2 * i + 1]; - r_high.values[2 * i + 1] += -(b_.values[2 * i]) * 
a_high.values[2 * i + 1]; - } - #endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_high.values += b_.values * a_high.values; + r_low.values += b_.values * a_low.values; return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot270_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot270_laneq_f16 - #define vcmlaq_rot270_laneq_f16(r, a, b, lane) simde_vcmlaq_rot270_laneq_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_rot270_laneq_f16(r, a, b, lane) vcmlaq_rot270_laneq_f16(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot270_laneq_f16 + #define vcmlaq_rot270_laneq_f16(r, a, b, lane) 
simde_vcmlaq_rot270_laneq_f16(r, a, b, lane) +#endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x4_t simde_vcmlaq_rot270_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, - const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +simde_float32x4_t +simde_vcmlaq_rot270_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), - b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); uint32_t idx1[4] = {1, 1, 3, 3}; uint32_t idx2[4] = {5, 0, 7, 2}; vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); @@ -377,29 +390,30 @@ simde_float32x4_t simde_vcmlaq_rot270_laneq_f32(simde_float32x4_t r, simde_float vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); 
#else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; - } + return simde_vcmlaq_rot270_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); #endif - return simde_float32x4_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot270_laneq_f32 - #define vcmlaq_rot270_laneq_f32(r, a, b, lane) simde_vcmlaq_rot270_laneq_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_rot270_laneq_f32(r, a, b, lane) vcmlaq_rot270_laneq_f32(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot270_laneq_f32 + #define vcmlaq_rot270_laneq_f32(r, a, b, lane) simde_vcmlaq_rot270_laneq_f32(r, a, b, lane) +#endif SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/arm/neon/cmla_rot90.h b/thirdparty/simde/arm/neon/cmla_rot90.h index d16a09b20..73cf3d34b 100644 --- a/thirdparty/simde/arm/neon/cmla_rot90.h +++ b/thirdparty/simde/arm/neon/cmla_rot90.h @@ -37,7 +37,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t simde_vcmla_rot90_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { - #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) @@ -63,49 +63,18 @@ simde_vcmla_rot90_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_ return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmla_rot90_f16 #define vcmla_rot90_f16(r, a, b) simde_vcmla_rot90_f16(r, a, b) #endif -SIMDE_FUNCTION_ATTRIBUTES -simde_float16x8_t -simde_vcmlaq_rot90_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ - defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) - return vcmlaq_rot90_f16(r, a, b); - #else - simde_float16x8_private - r_ = simde_float16x8_to_private(r), - a_ = simde_float16x8_to_private(a), - b_ = simde_float16x8_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { - r_.values[2 * i] = simde_float16_from_float32( - simde_float16_to_float32(r_.values[2 * i]) - - simde_float16_to_float32(b_.values[2 * i + 1]) * - simde_float16_to_float32(a_.values[2 * i + 1])); - r_.values[2 * i + 1] = simde_float16_from_float32( - simde_float16_to_float32(r_.values[2 * i + 1]) + - simde_float16_to_float32(b_.values[2 
* i]) * - simde_float16_to_float32(a_.values[2 * i + 1])); - } - - return simde_float16x8_from_private(r_); - #endif -} -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot90_f16 - #define vcmlaq_rot90_f16(r, a, b) simde_vcmlaq_rot90_f16(r, a, b) -#endif - SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_rot90_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARCH_ARM_COMPLEX) @@ -131,15 +100,55 @@ simde_vcmla_rot90_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_ return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmla_rot90_f32 #define vcmla_rot90_f32(r, a, b) simde_vcmla_rot90_f32(r, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot90_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmlaq_rot90_f16(r, a, b); + #else + simde_float16x8_private + r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 
* i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) - + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) + + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX))) + #undef vcmlaq_rot90_f16 + #define vcmlaq_rot90_f16(r, a, b) simde_vcmlaq_rot90_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARCH_ARM_COMPLEX) @@ -169,7 +178,10 @@ simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4 return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_rot90_f32 #define vcmlaq_rot90_f32(r, a, b) simde_vcmlaq_rot90_f32(r, a, b) #endif @@ -177,7 +189,7 @@ simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4 
SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vcmlaq_rot90_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ defined(SIMDE_ARCH_ARM_COMPLEX) @@ -207,7 +219,10 @@ simde_vcmlaq_rot90_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2 return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX))) #undef vcmlaq_rot90_f64 #define vcmlaq_rot90_f64(r, a, b) simde_vcmlaq_rot90_f64(r, a, b) #endif diff --git a/thirdparty/simde/arm/neon/cmla_rot90_lane.h b/thirdparty/simde/arm/neon/cmla_rot90_lane.h index 7e4cbea29..bb530eaca 100644 --- a/thirdparty/simde/arm/neon/cmla_rot90_lane.h +++ b/thirdparty/simde/arm/neon/cmla_rot90_lane.h @@ -35,372 +35,385 @@ #include "get_low.h" #include "mul.h" #include "types.h" +#include "cmla_rot90.h" +#include "get_lane.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES -simde_float16x4_t simde_vcmla_rot90_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) +simde_float16x4_t +simde_vcmla_rot90_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - simde_float16x4_private r_ = simde_float16x4_to_private(r), - a_ = simde_float16x4_to_private(a), - b_ = 
simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]); - uint16_t idx1[4] = {1, 1, 3, 3}; - uint16_t idx2[4] = {1, 4, 3, 6}; - vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ - a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); - vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ - __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); - r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); - return simde_float16x4_from_private(r_); - #else + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {1, 1, 3, 3}; + uint16_t idx2[4] = {1, 4, 3, 6}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); - b_.values = 
SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; - } - #endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + #else + return simde_vcmla_rot90_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot90_lane_f16 - #define vcmla_rot90_lane_f16(r, a, b, lane) simde_vcmla_rot90_lane_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_rot90_lane_f16(r, a, b, lane) vcmla_rot90_lane_f16(r, a, b, lane) +#else + #define simde_vcmla_rot90_lane_f16(r, a, b, lane) simde_vcmla_rot90_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) 
&& \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot90_lane_f16 + #define vcmla_rot90_lane_f16(r, a, b, lane) simde_vcmla_rot90_lane_f16(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x2_t simde_vcmla_rot90_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) +simde_float32x2_t +simde_vcmla_rot90_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { - simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), - b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) - uint32_t idx1[2] = {1, 1}; - uint32_t idx2[2] = {1, 2}; - vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); - vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); - r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {1, 2}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); #elif 
defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; - } + return simde_vcmla_rot90_f32(r, a, b_tmp); #endif - return simde_float32x2_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot90_lane_f32 - #define vcmla_rot90_lane_f32(r, a, b, lane) simde_vcmla_rot90_lane_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmla_rot90_lane_f32(r, a, b, lane) vcmla_rot90_lane_f32(r, a, b, lane) +#else + #define simde_vcmla_rot90_lane_f32(r, a, b, lane) simde_vcmla_rot90_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot90_lane_f32 + #define vcmla_rot90_lane_f32(r, a, b, lane) simde_vcmla_rot90_lane_f32(r, a, b, 
lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x4_t simde_vcmla_rot90_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) +simde_float16x8_t +simde_vcmlaq_rot90_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - simde_float16x4_private r_ = simde_float16x4_to_private(r), - a_ = simde_float16x4_to_private(a), - b_ = simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]); - uint16_t idx1[4] = {1, 1, 3, 3}; - uint16_t idx2[4] = {1, 4, 3, 6}; - vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ - a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); - vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ - __riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); - r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); - return simde_float16x4_from_private(r_); - #else - simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), - a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; + uint16_t idx2[8] = {1, 8, 3, 10, 5, 12, 7, 14}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t 
b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); - r_.values += b_.values * a_.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; - } - #endif - return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + return 
simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot90_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x2_to_private(simde_vreinterpret_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot90_laneq_f16 - #define vcmla_rot90_laneq_f16(r, a, b, lane) simde_vcmla_rot90_laneq_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) - #define simde_vcmla_rot90_laneq_f16(r, a, b, lane) vcmla_rot90_laneq_f16(r, a, b, lane) + #define simde_vcmlaq_rot90_lane_f16(r, a, b, lane) vcmlaq_rot90_lane_f16(r, a, b, lane) +#else + #define simde_vcmlaq_rot90_lane_f16(r, a, b, lane) simde_vcmlaq_rot90_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_lane_u32(simde_vreinterpret_u32_f16(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot90_lane_f16 + #define vcmlaq_rot90_lane_f16(r, a, b, lane) simde_vcmlaq_rot90_lane_f16(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x2_t simde_vcmla_rot90_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) - SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +simde_float32x4_t +simde_vcmlaq_rot90_lane_f32(simde_float32x4_t r, 
simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { - simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), - b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); - #if defined(SIMDE_RISCV_V_NATIVE) - uint32_t idx1[2] = {1, 1}; - uint32_t idx2[2] = {1, 2}; - vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); - vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ - __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); - r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {1, 1, 3, 3}; + uint32_t idx2[4] = {1, 4, 3, 6}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + 
b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; - } + return simde_vcmlaq_rot90_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x1_to_private(simde_vreinterpret_u64_f32(b)).values[lane]))); #endif - return simde_float32x2_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmla_rot90_laneq_f32 - #define vcmla_rot90_laneq_f32(r, a, b, lane) simde_vcmla_rot90_laneq_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) - #define simde_vcmla_rot90_laneq_f32(r, a, b, lane) vcmla_rot90_laneq_f32(r, a, b, lane) + #define simde_vcmlaq_rot90_lane_f32(r, a, b, lane) vcmlaq_rot90_lane_f32(r, a, b, lane) +#else + #define simde_vcmlaq_rot90_lane_f32(r, a, b, lane) simde_vcmlaq_rot90_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_lane_u64(simde_vreinterpret_u64_f32(b), lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + 
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot90_lane_f32 + #define vcmlaq_rot90_lane_f32(r, a, b, lane) simde_vcmlaq_rot90_lane_f32(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x8_t simde_vcmlaq_rot90_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) +simde_float16x4_t +simde_vcmla_rot90_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - simde_float16x8_private r_ = simde_float16x8_to_private(r), - a_ = simde_float16x8_to_private(a), - b_ = simde_vdupq_n_f16(simde_float16x4_to_private(b).values[lane]); - uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; - uint16_t idx2[8] = {1, 8, 3, 10, 5, 12, 7, 14}; - vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); - vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); - vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ - a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); - vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ - __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); - r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); - return simde_float16x8_from_private(r_); + simde_float16x4_private r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[4] = {1, 1, 3, 3}; + uint16_t idx2[4] = {1, 4, 3, 6}; + vfloat16m1_t op1 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + a_.sv64, a_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx1, 4), 4); + vfloat16m1_t op2 = __riscv_vrgather_vv_f16m1(__riscv_vslideup_vx_f16m1( \ + 
__riscv_vfneg_v_f16m1(b_.sv64, 4), b_.sv64, 4, 8), __riscv_vle16_v_u16m1(idx2, 4), 4); + r_.sv64 = __riscv_vfmacc_vv_f16m1(r_.sv64, op1, op2, 4); + return simde_float16x4_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); #else - simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), - a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), - r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), - a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); - a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i 
< (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += -(b_.values[2 * i + 1]) * a_low.values[2 * i + 1]; - r_low.values[2 * i + 1] += b_.values[2 * i] * a_low.values[2 * i + 1]; - r_high.values[2 * i] += -(b_.values[2 * i + 1]) * a_high.values[2 * i + 1]; - r_high.values[2 * i + 1] += b_.values[2 * i] * a_high.values[2 * i + 1]; - } - #endif - return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), - simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + return simde_vcmla_rot90_f16(r, a, simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot90_lane_f16 - #define vcmlaq_rot90_lane_f16(r, a, b, lane) simde_vcmlaq_rot90_lane_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) - #define simde_vcmlaq_rot90_lane_f16(r, a, b, lane) vcmlaq_rot90_lane_f16(r, a, b, lane) + #define simde_vcmla_rot90_laneq_f16(r, a, b, lane) vcmla_rot90_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot90_laneq_f16 + #define vcmla_rot90_laneq_f16(r, a, b, lane) simde_vcmla_rot90_laneq_f16(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float32x4_t 
simde_vcmlaq_rot90_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) - SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +simde_float32x2_t +simde_vcmla_rot90_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), - b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); + simde_float32x2_t b_tmp = simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) - uint32_t idx1[4] = {1, 1, 3, 3}; - uint32_t idx2[4] = {1, 4, 3, 6}; - vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); - vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); - vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ - a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); - vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ - __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); - r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + uint32_t idx1[2] = {1, 1}; + uint32_t idx2[2] = {1, 2}; + vfloat32m1_t op1 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + a_.sv64, a_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx1, 2), 2); + vfloat32m1_t op2 = __riscv_vrgather_vv_f32m1(__riscv_vslideup_vx_f32m1( \ + __riscv_vfneg_v_f32m1(b_.sv64, 2), b_.sv64, 2, 4), __riscv_vle32_v_u32m1(idx2, 2), 2); + r_.sv64 = __riscv_vfmacc_vv_f32m1(r_.sv64, op1, op2, 2); + return simde_float32x2_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && 
!defined(SIMDE_BUG_GCC_100760) - a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + simde_float32x2_private r_ = simde_float32x2_to_private(r), + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b_tmp); + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); r_.values += b_.values * a_.values; + return simde_float32x2_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; - } + return simde_vcmla_rot90_f32(r, a, simde_vreinterpret_f32_u64(simde_vdup_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); #endif - return simde_float32x4_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot90_lane_f32 - #define vcmlaq_rot90_lane_f32(r, a, b, lane) simde_vcmlaq_rot90_lane_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) - #define simde_vcmlaq_rot90_lane_f32(r, a, b, lane) vcmlaq_rot90_lane_f32(r, a, b, lane) + #define simde_vcmla_rot90_laneq_f32(r, a, b, lane) vcmla_rot90_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + 
(!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmla_rot90_laneq_f32 + #define vcmla_rot90_laneq_f32(r, a, b, lane) simde_vcmla_rot90_laneq_f32(r, a, b, lane) #endif SIMDE_FUNCTION_ATTRIBUTES -simde_float16x8_t simde_vcmlaq_rot90_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) +simde_float16x8_t +simde_vcmlaq_rot90_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE > 128) - simde_float16x8_private r_ = simde_float16x8_to_private(r), - a_ = simde_float16x8_to_private(a), - b_ = simde_vdupq_n_f16(simde_float16x8_to_private(b).values[lane]); - uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; - uint16_t idx2[8] = {1, 8, 3, 10, 5, 12, 7, 14}; - vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); - vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); - vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ - a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); - vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ - __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); - r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); - return simde_float16x8_from_private(r_); - #else + simde_float16x8_private r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private( + simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); + uint16_t idx1[8] = {1, 1, 3, 3, 5, 5, 7, 7}; + uint16_t idx2[8] = {1, 8, 3, 10, 5, 12, 7, 14}; + vfloat16m2_t a_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (a_.sv128); + vfloat16m2_t b_tmp = __riscv_vlmul_ext_v_f16m1_f16m2 (b_.sv128); + 
vfloat16m1_t op1 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + a_tmp, a_tmp, 8, 16), __riscv_vle16_v_u16m2(idx1, 8), 8)); + vfloat16m1_t op2 = __riscv_vlmul_trunc_v_f16m2_f16m1(__riscv_vrgather_vv_f16m2(__riscv_vslideup_vx_f16m2( \ + __riscv_vfneg_v_f16m2(b_tmp, 8), b_tmp, 8, 16), __riscv_vle16_v_u16m2(idx2, 8), 8)); + r_.sv128 = __riscv_vfmacc_vv_f16m1(r_.sv128, op1, op2, 8); + return simde_float16x8_from_private(r_); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), - b_ = simde_float32x4_to_private( - simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); - #if defined(SIMDE_SHUFFLE_VECTOR_) && \ - ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) - a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); - a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); - b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); - r_low.values += b_.values * a_low.values; - r_high.values += b_.values * a_high.values; - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) - { - r_low.values[2 * i] += -(b_.values[2 * i + 1]) * a_low.values[2 * i + 1]; - r_low.values[2 * i + 1] += b_.values[2 * i] * a_low.values[2 * i + 1]; - r_high.values[2 * i] += -(b_.values[2 * i + 1]) * a_high.values[2 * i + 1]; - r_high.values[2 * i + 1] += b_.values[2 * i] * a_high.values[2 * i + 1]; - } - 
#endif + b_ = simde_float32x4_to_private(simde_vcvt_f32_f16( + simde_vreinterpret_f16_u32(simde_vdup_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane])))); + a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); + #else + return simde_vcmlaq_rot90_f16(r, a, simde_vreinterpretq_f16_u32(simde_vdupq_n_u32(simde_uint32x4_to_private(simde_vreinterpretq_u32_f16(b)).values[lane]))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot90_laneq_f16 - #define vcmlaq_rot90_laneq_f16(r, a, b, lane) simde_vcmlaq_rot90_laneq_f16(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_rot90_laneq_f16(r, a, b, lane) vcmlaq_rot90_laneq_f16(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot90_laneq_f16 + #define vcmlaq_rot90_laneq_f16(r, a, b, lane) simde_vcmlaq_rot90_laneq_f16(r, a, b, lane) +#endif 
SIMDE_FUNCTION_ATTRIBUTES -simde_float32x4_t simde_vcmlaq_rot90_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, const int lane) +simde_float32x4_t +simde_vcmlaq_rot90_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), - b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); #if defined(SIMDE_RISCV_V_NATIVE) - uint32_t idx1[4] = {1, 1, 3, 3}; - uint32_t idx2[4] = {1, 4, 3, 6}; - vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); - vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); - vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ - a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); - vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ - __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), __riscv_vle32_v_u32m2(idx2, 4), 4)); - r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + simde_float32x4_private r_ = simde_float32x4_to_private(r), + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); + uint32_t idx1[4] = {1, 1, 3, 3}; + uint32_t idx2[4] = {1, 4, 3, 6}; + vfloat32m2_t a_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (a_.sv128); + vfloat32m2_t b_tmp = __riscv_vlmul_ext_v_f32m1_f32m2 (b_.sv128); + vfloat32m1_t op1 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + a_tmp, a_tmp, 4, 8), __riscv_vle32_v_u32m2(idx1, 4), 4)); + vfloat32m1_t op2 = __riscv_vlmul_trunc_v_f32m2_f32m1(__riscv_vrgather_vv_f32m2(__riscv_vslideup_vx_f32m2( \ + __riscv_vfneg_v_f32m2(b_tmp, 4), b_tmp, 4, 8), 
__riscv_vle32_v_u32m2(idx2, 4), 4)); + r_.sv128 = __riscv_vfmacc_vv_f32m1(r_.sv128, op1, op2, 4); + return simde_float32x4_from_private(r_); #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_.values += b_.values * a_.values; + return simde_float32x4_from_private(r_); #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) - { - r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; - r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; - } + return simde_vcmlaq_rot90_f32(r, a, simde_vreinterpretq_f32_u64(simde_vdupq_n_u64(simde_uint64x2_to_private(simde_vreinterpretq_u64_f32(b)).values[lane]))); #endif - return simde_float32x4_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcmlaq_rot90_laneq_f32 - #define vcmlaq_rot90_laneq_f32(r, a, b, lane) simde_vcmlaq_rot90_laneq_f32(r, a, b, lane) -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) #define simde_vcmlaq_rot90_laneq_f32(r, a, b, lane) vcmlaq_rot90_laneq_f32(r, a, b, lane) #endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_COMPLEX) && \ + (!defined(HEDLEY_GCC_VERSION) 
|| HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)))) + #undef vcmlaq_rot90_laneq_f32 + #define vcmlaq_rot90_laneq_f32(r, a, b, lane) simde_vcmlaq_rot90_laneq_f32(r, a, b, lane) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/arm/neon/combine.h b/thirdparty/simde/arm/neon/combine.h index 3db44edfa..d54662fa4 100644 --- a/thirdparty/simde/arm/neon/combine.h +++ b/thirdparty/simde/arm/neon/combine.h @@ -59,7 +59,8 @@ simde_vcombine_f16(simde_float16x4_t low, simde_float16x4_t high) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcombine_f16 #define vcombine_f16(low, high) simde_vcombine_f16((low), (high)) #endif @@ -486,7 +487,8 @@ simde_vcombine_bf16(simde_bfloat16x4_t low, simde_bfloat16x4_t high) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcombine_bf16 #define vcombine_bf16(low, high) simde_vcombine_bf16((low), (high)) #endif diff --git a/thirdparty/simde/arm/neon/copy_lane.h b/thirdparty/simde/arm/neon/copy_lane.h index 7195c8076..6a57c44fe 100644 --- a/thirdparty/simde/arm/neon/copy_lane.h +++ b/thirdparty/simde/arm/neon/copy_lane.h @@ -869,7 +869,8 @@ simde_vcopy_lane_p8(simde_poly8x8_t a, const int lane1, simde_poly8x8_t b, const #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopy_lane_p8(a, lane1, b, lane2) vcopy_lane_p8((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + 
defined(SIMDE_BUG_CLANG_71362)) #undef vcopy_lane_p8 #define vcopy_lane_p8(a, lane1, b, lane2) simde_vcopy_lane_p8((a), (lane1), (b), (lane2)) #endif @@ -889,7 +890,8 @@ simde_vcopy_lane_p16(simde_poly16x4_t a, const int lane1, simde_poly16x4_t b, co #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopy_lane_p16(a, lane1, b, lane2) vcopy_lane_p16((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vcopy_lane_p16 #define vcopy_lane_p16(a, lane1, b, lane2) simde_vcopy_lane_p16((a), (lane1), (b), (lane2)) #endif @@ -909,7 +911,8 @@ simde_vcopy_lane_p64(simde_poly64x1_t a, const int lane1, simde_poly64x1_t b, co #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopy_lane_p64(a, lane1, b, lane2) vcopy_lane_p64((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vcopy_lane_p64 #define vcopy_lane_p64(a, lane1, b, lane2) simde_vcopy_lane_p64((a), (lane1), (b), (lane2)) #endif @@ -930,7 +933,8 @@ simde_vcopy_laneq_p8(simde_poly8x8_t a, const int lane1, simde_poly8x16_t b, con #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopy_laneq_p8(a, lane1, b, lane2) vcopy_laneq_p8((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vcopy_laneq_p8 #define vcopy_laneq_p8(a, lane1, b, lane2) simde_vcopy_laneq_p8((a), (lane1), (b), (lane2)) #endif @@ -951,7 +955,8 @@ simde_vcopy_laneq_p16(simde_poly16x4_t a, const int 
lane1, simde_poly16x8_t b, c #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopy_laneq_p16(a, lane1, b, lane2) vcopy_laneq_p16((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vcopy_laneq_p16 #define vcopy_laneq_p16(a, lane1, b, lane2) simde_vcopy_laneq_p16((a), (lane1), (b), (lane2)) #endif @@ -972,7 +977,8 @@ simde_vcopy_laneq_p64(simde_poly64x1_t a, const int lane1, simde_poly64x2_t b, c #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopy_laneq_p64(a, lane1, b, lane2) vcopy_laneq_p64((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vcopy_laneq_p64 #define vcopy_laneq_p64(a, lane1, b, lane2) simde_vcopy_laneq_p64((a), (lane1), (b), (lane2)) #endif @@ -993,7 +999,8 @@ simde_vcopyq_lane_p8(simde_poly8x16_t a, const int lane1, simde_poly8x8_t b, con #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopyq_lane_p8(a, lane1, b, lane2) vcopyq_lane_p8((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vcopyq_lane_p8 #define vcopyq_lane_p8(a, lane1, b, lane2) simde_vcopyq_lane_p8((a), (lane1), (b), (lane2)) #endif @@ -1014,7 +1021,8 @@ simde_vcopyq_lane_p16(simde_poly16x8_t a, const int lane1, simde_poly16x4_t b, c #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopyq_lane_p16(a, lane1, b, lane2) vcopyq_lane_p16((a), (lane1), (b), (lane2)) #endif -#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vcopyq_lane_p16 #define vcopyq_lane_p16(a, lane1, b, lane2) simde_vcopyq_lane_p16((a), (lane1), (b), (lane2)) #endif @@ -1035,7 +1043,8 @@ simde_vcopyq_lane_p64(simde_poly64x2_t a, const int lane1, simde_poly64x1_t b, c #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopyq_lane_p64(a, lane1, b, lane2) vcopyq_lane_p64((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vcopyq_lane_p64 #define vcopyq_lane_p64(a, lane1, b, lane2) simde_vcopyq_lane_p64((a), (lane1), (b), (lane2)) #endif @@ -1055,7 +1064,8 @@ simde_vcopyq_laneq_p8(simde_poly8x16_t a, const int lane1, simde_poly8x16_t b, c #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopyq_laneq_p8(a, lane1, b, lane2) vcopyq_laneq_p8((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vcopyq_laneq_p8 #define vcopyq_laneq_p8(a, lane1, b, lane2) simde_vcopyq_laneq_p8((a), (lane1), (b), (lane2)) #endif @@ -1075,7 +1085,8 @@ simde_vcopyq_laneq_p16(simde_poly16x8_t a, const int lane1, simde_poly16x8_t b, #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopyq_laneq_p16(a, lane1, b, lane2) vcopyq_laneq_p16((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef 
vcopyq_laneq_p16 #define vcopyq_laneq_p16(a, lane1, b, lane2) simde_vcopyq_laneq_p16((a), (lane1), (b), (lane2)) #endif @@ -1095,7 +1106,8 @@ simde_vcopyq_laneq_p64(simde_poly64x2_t a, const int lane1, simde_poly64x2_t b, #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vcopyq_laneq_p64(a, lane1, b, lane2) vcopyq_laneq_p64((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vcopyq_laneq_p64 #define vcopyq_laneq_p64(a, lane1, b, lane2) simde_vcopyq_laneq_p64((a), (lane1), (b), (lane2)) #endif @@ -1115,7 +1127,8 @@ simde_vcopy_lane_bf16(simde_bfloat16x4_t a, const int lane1, simde_bfloat16x4_t #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vcopy_lane_bf16(a, lane1, b, lane2) vcopy_lane_bf16((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcopy_lane_bf16 #define vcopy_lane_bf16(a, lane1, b, lane2) simde_vcopy_lane_bf16((a), (lane1), (b), (lane2)) #endif @@ -1134,7 +1147,8 @@ simde_vcopy_laneq_bf16(simde_bfloat16x4_t a, const int lane1, simde_bfloat16x8_t #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vcopy_laneq_bf16(a, lane1, b, lane2) vcopy_laneq_bf16((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcopy_laneq_bf16 #define vcopy_laneq_bf16(a, lane1, b, lane2) simde_vcopy_laneq_bf16((a), (lane1), (b), (lane2)) #endif @@ -1153,7 +1167,8 @@ simde_vcopyq_lane_bf16(simde_bfloat16x8_t a, const int 
lane1, simde_bfloat16x4_t #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vcopyq_lane_bf16(a, lane1, b, lane2) vcopyq_lane_bf16((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcopyq_lane_bf16 #define vcopyq_lane_bf16(a, lane1, b, lane2) simde_vcopyq_lane_bf16((a), (lane1), (b), (lane2)) #endif @@ -1173,7 +1188,8 @@ simde_vcopyq_laneq_bf16(simde_bfloat16x8_t a, const int lane1, simde_bfloat16x8_ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vcopyq_laneq_bf16(a, lane1, b, lane2) vcopyq_laneq_bf16((a), (lane1), (b), (lane2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcopyq_laneq_bf16 #define vcopyq_laneq_bf16(a, lane1, b, lane2) simde_vcopyq_laneq_bf16((a), (lane1), (b), (lane2)) #endif diff --git a/thirdparty/simde/arm/neon/crc32.h b/thirdparty/simde/arm/neon/crc32.h index 53ae13983..50f8f1424 100644 --- a/thirdparty/simde/arm/neon/crc32.h +++ b/thirdparty/simde/arm/neon/crc32.h @@ -78,7 +78,8 @@ simde___crc32b(uint32_t a, uint8_t b) { return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) #undef __crc32b #define __crc32b(a, b) simde___crc32b((a), (b)) #endif @@ -104,7 +105,8 @@ simde___crc32h(uint32_t a, uint16_t b) { return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) #undef __crc32h #define __crc32h(a, b) simde___crc32h((a), (b)) #endif @@ -129,7 +131,8 @@ simde___crc32w(uint32_t a, uint32_t b) { return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) #undef __crc32w #define __crc32w(a, b) simde___crc32w((a), (b)) #endif @@ -164,7 +167,8 @@ simde___crc32d(uint32_t a, uint64_t b) { return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) #undef __crc32d #define __crc32d(a, b) simde___crc32d((a), (b)) #endif @@ -190,7 +194,8 @@ simde___crc32cb(uint32_t a, uint8_t b) { return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) #undef __crc32cb #define __crc32cb(a, b) simde___crc32cb((a), (b)) #endif @@ -216,7 +221,8 @@ simde___crc32ch(uint32_t a, uint16_t b) { return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) #undef __crc32ch #define __crc32ch(a, b) simde___crc32ch((a), (b)) #endif @@ -241,7 +247,8 @@ simde___crc32cw(uint32_t a, uint32_t b) { return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) 
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) #undef __crc32cw #define __crc32cw(a, b) simde___crc32cw((a), (b)) #endif @@ -276,7 +283,8 @@ simde___crc32cd(uint32_t a, uint64_t b) { return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_CRC32))) #undef __crc32cd #define __crc32cd(a, b) simde___crc32cd((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/create.h b/thirdparty/simde/arm/neon/create.h index 5954922bb..4ba48cf0d 100644 --- a/thirdparty/simde/arm/neon/create.h +++ b/thirdparty/simde/arm/neon/create.h @@ -158,7 +158,8 @@ simde_vcreate_f16(uint64_t a) { return simde_vreinterpret_f16_u64(simde_vdup_n_u64(a)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcreate_f16 #define vcreate_f16(a) simde_vcreate_f16(a) #endif @@ -242,7 +243,8 @@ simde_vcreate_bf16(uint64_t a) { return simde_vreinterpret_bf16_u64(simde_vdup_n_u64(a)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcreate_bf16 #define vcreate_bf16(a) simde_vcreate_bf16(a) #endif diff --git a/thirdparty/simde/arm/neon/cvt.h b/thirdparty/simde/arm/neon/cvt.h index a4f280d78..96c39afe1 100644 --- a/thirdparty/simde/arm/neon/cvt.h +++ b/thirdparty/simde/arm/neon/cvt.h @@ -58,7 +58,8 @@ simde_vcvt_f16_f32(simde_float32x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_f16_f32 #define vcvt_f16_f32(a) simde_vcvt_f16_f32(a) #endif @@ -86,7 +87,8 @@ simde_vcvt_f32_f16(simde_float16x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_f32_f16 #define vcvt_f32_f16(a) simde_vcvt_f32_f16(a) #endif @@ -168,7 +170,8 @@ simde_vcvth_u16_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_u16_f16 #define vcvth_u16_f16(a) simde_vcvth_u16_f16(a) #endif @@ -194,7 +197,8 @@ simde_vcvth_s32_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_s32_f16 #define vcvth_s32_f16(a) simde_vcvth_s32_f16(a) #endif @@ -220,7 +224,8 @@ simde_vcvth_u32_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_u32_f16 #define vcvth_u32_f16(a) simde_vcvth_u32_f16(a) #endif @@ -246,7 +251,8 @@ simde_vcvth_s64_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_s64_f16 #define vcvth_s64_f16(a) simde_vcvth_s64_f16(a) #endif @@ -272,7 +278,8 @@ 
simde_vcvth_u64_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_u64_f16 #define vcvth_u64_f16(a) simde_vcvth_u64_f16(a) #endif @@ -320,7 +327,8 @@ simde_vcvts_u32_f32(simde_float32 a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvts_u32_f32 #define vcvts_u32_f32(a) simde_vcvts_u32_f32(a) #endif @@ -348,7 +356,8 @@ simde_vcvts_f32_u32 (uint32_t a) { return HEDLEY_STATIC_CAST(simde_float32, a); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvts_f32_u32 #define vcvts_f32_u32(a) simde_vcvts_f32_u32(a) #endif @@ -397,7 +406,8 @@ simde_vcvtd_u64_f64(simde_float64 a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtd_u64_f64 #define vcvtd_u64_f64(a) simde_vcvtd_u64_f64(a) #endif @@ -425,7 +435,8 @@ simde_vcvtd_f64_u64(uint64_t a) { return HEDLEY_STATIC_CAST(simde_float64, a); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtd_f64_u64 #define vcvtd_f64_u64(a) simde_vcvtd_f64_u64(a) #endif @@ -441,7 +452,8 @@ simde_vcvth_f16_u32(uint32_t a) { return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_f16_u32 #define vcvth_f16_u32(a) simde_vcvth_f16_u32(a) #endif @@ -457,7 +469,8 @@ simde_vcvth_f16_u64(uint64_t a) { return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_f16_u64 #define vcvth_f16_u64(a) simde_vcvth_f16_u64(a) #endif @@ -473,7 +486,8 @@ simde_vcvth_f16_s32(int32_t a) { return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_f16_s32 #define vcvth_f16_s32(a) simde_vcvth_f16_s32(a) #endif @@ -489,7 +503,8 @@ simde_vcvth_f16_s64(int64_t a) { return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_f16_s64 #define vcvth_f16_s64(a) simde_vcvth_f16_s64(a) #endif @@ -505,7 +520,8 @@ simde_vcvth_f16_s16(int16_t a) { return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_f16_s16 #define vcvth_f16_s16(a) simde_vcvth_f16_s16(a) #endif @@ -521,7 +537,8 @@ simde_vcvth_f16_u16(uint16_t a) { return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); #endif } -#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_f16_u16 #define vcvth_f16_u16(a) simde_vcvth_f16_u16(a) #endif @@ -585,7 +602,8 @@ simde_vcvt_u16_f16(simde_float16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_u16_f16 #define vcvt_u16_f16(a) simde_vcvt_u16_f16(a) #endif @@ -621,7 +639,8 @@ simde_vcvt_u32_f32(simde_float32x2_t a) { return simde_uint32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvt_u32_f32 #define vcvt_u32_f32(a) simde_vcvt_u32_f32(a) #endif @@ -694,7 +713,8 @@ simde_vcvt_u64_f64(simde_float64x1_t a) { return simde_uint64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvt_u64_f64 #define vcvt_u64_f64(a) simde_vcvt_u64_f64(a) #endif @@ -810,7 +830,8 @@ simde_vcvtq_u16_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtq_u16_f16 #define vcvtq_u16_f16(a) simde_vcvtq_u16_f16(a) #endif @@ -887,7 +908,8 @@ simde_vcvtq_u32_f32(simde_float32x4_t a) { return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtq_u32_f32 #define vcvtq_u32_f32(a) simde_vcvtq_u32_f32(a) #endif @@ -1064,7 +1086,8 @@ simde_vcvtq_u64_f64(simde_float64x2_t a) { return simde_uint64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtq_u64_f64 #define vcvtq_u64_f64(a) simde_vcvtq_u64_f64(a) #endif @@ -1096,7 +1119,8 @@ simde_vcvt_f16_s16(simde_int16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_f16_s16 #define vcvt_f16_s16(a) simde_vcvt_f16_s16(a) #endif @@ -1154,7 +1178,8 @@ simde_vcvt_f16_u16(simde_uint16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_f16_u16 #define vcvt_f16_u16(a) simde_vcvt_f16_u16(a) #endif @@ -1182,7 +1207,8 @@ simde_vcvt_f32_u32(simde_uint32x2_t a) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvt_f32_u32 #define vcvt_f32_u32(a) simde_vcvt_f32_u32(a) #endif @@ -1238,7 +1264,8 @@ simde_vcvt_f64_u64(simde_uint64x1_t a) { return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvt_f64_u64 #define vcvt_f64_u64(a) simde_vcvt_f64_u64(a) #endif @@ -1270,7 +1297,8 @@ simde_vcvtq_f16_s16(simde_int16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtq_f16_s16 #define vcvtq_f16_s16(a) simde_vcvtq_f16_s16(a) #endif @@ -1330,7 +1358,8 @@ simde_vcvtq_f16_u16(simde_uint16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16))) #undef vcvtq_f16_u16 #define vcvtq_f16_u16(a) simde_vcvtq_f16_u16(a) #endif @@ -1358,7 +1387,8 @@ simde_vcvtq_f32_u32(simde_uint32x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtq_f32_u32 #define vcvtq_f32_u32(a) simde_vcvtq_f32_u32(a) #endif @@ -1418,7 +1448,8 @@ simde_vcvtq_f64_u64(simde_uint64x2_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtq_f64_u64 #define vcvtq_f64_u64(a) simde_vcvtq_f64_u64(a) #endif @@ -1444,7 +1475,8 @@ simde_vcvtah_u16_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_CLANG_46844) && 
defined(SIMDE_ARM_NEON_FP16))) #undef vcvtah_u16_f16 #define vcvtah_u16_f16(a) simde_vcvtah_u16_f16(a) #endif @@ -1470,7 +1502,8 @@ simde_vcvtah_s32_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtah_s32_f16 #define vcvtah_s32_f16(a) simde_vcvtah_s32_f16(a) #endif @@ -1496,7 +1529,8 @@ simde_vcvtah_u32_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16))) #undef vcvtah_u32_f16 #define vcvtah_u32_f16(a) simde_vcvtah_u32_f16(a) #endif @@ -1522,7 +1556,8 @@ simde_vcvtah_s64_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtah_s64_f16 #define vcvtah_s64_f16(a) simde_vcvtah_s64_f16(a) #endif @@ -1548,7 +1583,8 @@ simde_vcvtah_u64_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16))) #undef vcvtah_u64_f16 #define vcvtah_u64_f16(a) simde_vcvtah_u64_f16(a) #endif @@ -1596,7 +1632,8 @@ simde_vcvtad_u64_f64(simde_float64 a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtad_u64_f64 #define vcvtad_u64_f64(a) simde_vcvtad_u64_f64(a) #endif @@ -1667,7 +1704,8 @@ simde_vcvta_u16_f16(simde_float16x4_t a) { return 
simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvta_u16_f16 #define vcvta_u16_f16(a) simde_vcvta_u16_f16(a) #endif @@ -1755,7 +1793,8 @@ simde_vcvtaq_u16_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtaq_u16_f16 #define vcvtaq_u16_f16(a) simde_vcvtaq_u16_f16(a) #endif @@ -1894,7 +1933,8 @@ simde_vcvt_high_f16_f32(simde_float16x4_t r, simde_float32x4_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_high_f16_f32 #define vcvt_high_f16_f32(r, a) simde_vcvt_high_f16_f32((r), (a)) #endif @@ -1922,7 +1962,8 @@ simde_vcvt_high_f32_f64(simde_float32x2_t r, simde_float64x2_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_high_f32_f64 #define vcvt_high_f32_f64(r, a) simde_vcvt_high_f32_f64((r), (a)) #endif @@ -1945,7 +1986,8 @@ simde_vcvt_high_f32_f16(simde_float16x8_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_high_f32_f16 #define vcvt_high_f32_f16(a) simde_vcvt_high_f32_f16(a) #endif @@ -2054,7 +2096,8 @@ 
simde_vcvt_bf16_f32(simde_float32x4_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcvt_bf16_f32 #define vcvt_bf16_f32(a) simde_vcvt_bf16_f32(a) #endif @@ -2076,7 +2119,8 @@ simde_vcvt_f32_bf16(simde_bfloat16x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcvt_f32_bf16 #define vcvt_f32_bf16(a) simde_vcvt_f32_bf16(a) #endif @@ -2090,7 +2134,8 @@ simde_vcvtah_f32_bf16(simde_bfloat16_t a) { return simde_bfloat16_to_float32(a); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcvtah_f32_bf16 #define vcvtah_f32_bf16(a) simde_vcvtah_f32_bf16(a) #endif @@ -2104,7 +2149,8 @@ simde_vcvth_bf16_f32(float a) { return simde_bfloat16_from_float32(a); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcvth_bf16_f32 #define vcvth_bf16_f32(a) simde_vcvth_bf16_f32(a) #endif @@ -2126,7 +2172,8 @@ simde_vcvtq_low_f32_bf16(simde_bfloat16x8_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcvtq_low_f32_bf16 #define vcvtq_low_f32_bf16(a) simde_vcvtq_low_f32_bf16(a) #endif @@ -2149,7 +2196,8 @@ simde_vcvtq_high_f32_bf16(simde_bfloat16x8_t a) { 
return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcvtq_high_f32_bf16 #define vcvtq_high_f32_bf16(a) simde_vcvtq_high_f32_bf16(a) #endif @@ -2173,7 +2221,8 @@ simde_vcvtq_low_bf16_f32(simde_float32x4_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcvtq_low_bf16_f32 #define vcvtq_low_bf16_f32(a) simde_vcvtq_low_bf16_f32(a) #endif @@ -2197,7 +2246,8 @@ simde_vcvtq_high_bf16_f32(simde_bfloat16x8_t inactive, simde_float32x4_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vcvtq_high_bf16_f32 #define vcvtq_high_bf16_f32(inactive, a) simde_vcvtq_high_bf16_f32((inactive), (a)) #endif diff --git a/thirdparty/simde/arm/neon/cvt_n.h b/thirdparty/simde/arm/neon/cvt_n.h index 663ee5cfe..3574a3f6c 100644 --- a/thirdparty/simde/arm/neon/cvt_n.h +++ b/thirdparty/simde/arm/neon/cvt_n.h @@ -45,7 +45,8 @@ simde_vcvth_n_u16_f16(simde_float16_t a, const int n) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vcvth_n_u16_f16(a, n) vcvth_n_u16_f16(a, n) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_n_u16_f16 #define vcvth_n_u16_f16(a, n) simde_vcvth_n_u16_f16(a, n) #endif @@ -61,7 +62,8 @@ simde_vcvth_n_f16_s16(int16_t a, const int n) #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vcvth_n_f16_s16(a, n) vcvth_n_f16_s16(a, n) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_n_f16_s16 #define vcvth_n_f16_s16(a, n) simde_vcvth_n_f16_s16(a, n) #endif @@ -77,7 +79,8 @@ simde_vcvth_n_f16_u16(uint16_t a, const int n) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vcvth_n_f16_u16(a, n) vcvth_n_f16_u16(a, n) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvth_n_f16_u16 #define vcvth_n_f16_u16(a, n) simde_vcvth_n_f16_u16(a, n) #endif @@ -259,7 +262,8 @@ simde_vcvt_n_u16_f16(simde_float16x4_t a, const int n) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vcvt_n_u16_f16(a, n) vcvt_n_u16_f16((a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_n_u16_f16 #define vcvt_n_u16_f16(a, n) simde_vcvt_n_u16_f16((a), (n)) #endif @@ -303,7 +307,8 @@ simde_vcvt_n_u64_f64(simde_float64x1_t a, const int n) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) #define simde_vcvt_n_u64_f64(a, n) vcvt_n_u64_f64((a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvt_n_u64_f64 #define vcvt_n_u64_f64(a, n) simde_vcvt_n_u64_f64((a), (n)) #endif @@ -371,7 +376,8 @@ simde_vcvtq_n_u16_f16(simde_float16x8_t a, const int n) #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vcvtq_n_u16_f16(a, n) vcvtq_n_u16_f16((a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtq_n_u16_f16 #define vcvtq_n_u16_f16(a, n) simde_vcvtq_n_u16_f16((a), (n)) #endif @@ -393,7 +399,8 @@ simde_vcvtq_n_u32_f32(simde_float32x4_t a, const int n) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) #define simde_vcvtq_n_u32_f32(a, n) vcvtq_n_u32_f32((a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtq_n_u32_f32 #define vcvtq_n_u32_f32(a, n) simde_vcvtq_n_u32_f32((a), (n)) #endif @@ -415,7 +422,8 @@ simde_vcvtq_n_u64_f64(simde_float64x2_t a, const int n) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) #define simde_vcvtq_n_u64_f64(a, n) vcvtq_n_u64_f64((a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtq_n_u64_f64 #define vcvtq_n_u64_f64(a, n) simde_vcvtq_n_u64_f64((a), (n)) #endif @@ -437,7 +445,8 @@ simde_vcvt_n_f16_u16(simde_uint16x4_t a, const int n) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vcvt_n_f16_u16(a, n) vcvt_n_f16_u16((a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_n_f16_u16 #define vcvt_n_f16_u16(a, n) simde_vcvt_n_f16_u16((a), (n)) #endif @@ -459,7 +468,8 @@ simde_vcvt_n_f16_s16(simde_int16x4_t a, 
const int n) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vcvt_n_f16_s16(a, n) vcvt_n_f16_s16((a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvt_n_f16_s16 #define vcvt_n_f16_s16(a, n) simde_vcvt_n_f16_s16((a), (n)) #endif @@ -481,7 +491,8 @@ simde_vcvtq_n_f16_u16(simde_uint16x8_t a, const int n) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vcvtq_n_f16_u16(a, n) vcvtq_n_f16_u16((a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtq_n_f16_u16 #define vcvtq_n_f16_u16(a, n) simde_vcvtq_n_f16_u16((a), (n)) #endif @@ -503,7 +514,8 @@ simde_vcvtq_n_f16_s16(simde_int16x8_t a, const int n) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vcvtq_n_f16_s16(a, n) vcvtq_n_f16_s16((a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtq_n_f16_s16 #define vcvtq_n_f16_s16(a, n) simde_vcvtq_n_f16_s16((a), (n)) #endif diff --git a/thirdparty/simde/arm/neon/cvtm.h b/thirdparty/simde/arm/neon/cvtm.h index ae2c98ae0..bf0aed7bb 100644 --- a/thirdparty/simde/arm/neon/cvtm.h +++ b/thirdparty/simde/arm/neon/cvtm.h @@ -56,7 +56,8 @@ simde_vcvtmh_s64_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtmh_s64_f16 #define vcvtmh_s64_f16(a) simde_vcvtmh_s64_f16(a) #endif @@ -83,7 +84,8 @@ 
simde_vcvtmh_s32_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtmh_s32_f16 #define vcvtmh_s32_f16(a) simde_vcvtmh_s32_f16(a) #endif @@ -110,7 +112,8 @@ simde_vcvtmh_u64_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtmh_u64_f16 #define vcvtmh_u64_f16(a) simde_vcvtmh_u64_f16(a) #endif @@ -137,7 +140,8 @@ simde_vcvtmh_u32_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtmh_u32_f16 #define vcvtmh_u32_f16(a) simde_vcvtmh_u32_f16(a) #endif @@ -164,7 +168,8 @@ simde_vcvtmh_u16_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtmh_u16_f16 #define vcvtmh_u16_f16(a) simde_vcvtmh_u16_f16(a) #endif @@ -234,7 +239,8 @@ simde_vcvtmq_u16_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtmq_u16_f16 #define vcvtmq_u16_f16(a) simde_vcvtmq_u16_f16(a) #endif @@ -269,7 +275,8 @@ simde_vcvtmq_u32_f32(simde_float32x4_t a) { return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtmq_u32_f32 #define vcvtmq_u32_f32(a) simde_vcvtmq_u32_f32(a) #endif @@ -326,7 +333,8 @@ simde_vcvtm_u16_f16(simde_float16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtm_u16_f16 #define vcvtm_u16_f16(a) simde_vcvtm_u16_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/cvtn.h b/thirdparty/simde/arm/neon/cvtn.h index 8198a9721..1363989f8 100644 --- a/thirdparty/simde/arm/neon/cvtn.h +++ b/thirdparty/simde/arm/neon/cvtn.h @@ -123,7 +123,8 @@ simde_vcvtnh_s64_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtnh_s64_f16 #define vcvtnh_s64_f16(a) simde_vcvtnh_s64_f16(a) #endif @@ -148,7 +149,8 @@ simde_vcvtnh_s32_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtnh_s32_f16 #define vcvtnh_s32_f16(a) simde_vcvtnh_s32_f16(a) #endif @@ -173,7 +175,8 @@ simde_vcvtnh_u64_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtnh_u64_f16 #define vcvtnh_u64_f16(a) simde_vcvtnh_u64_f16(a) #endif @@ -198,7 +201,8 @@ simde_vcvtnh_u32_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtnh_u32_f16 #define vcvtnh_u32_f16(a) simde_vcvtnh_u32_f16(a) #endif @@ -223,7 +227,8 @@ simde_vcvtnh_u16_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtnh_u16_f16 #define vcvtnh_u16_f16(a) simde_vcvtnh_u16_f16(a) #endif @@ -305,7 +310,8 @@ simde_vcvtnq_u32_f32(simde_float32x4_t a) { return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtnq_u32_f32 #define vcvtnq_u32_f32(a) simde_vcvtnq_u32_f32(a) #endif @@ -409,7 +415,8 @@ simde_vcvtnq_u16_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtnq_u16_f16 #define vcvtnq_u16_f16(a) simde_vcvtnq_u16_f16(a) #endif @@ -431,7 +438,8 @@ simde_vcvtn_u16_f16(simde_float16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtn_u16_f16 #define vcvtn_u16_f16(a) simde_vcvtn_u16_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/cvtp.h b/thirdparty/simde/arm/neon/cvtp.h index 92bcb2b99..5c29f5dbc 100644 --- a/thirdparty/simde/arm/neon/cvtp.h +++ b/thirdparty/simde/arm/neon/cvtp.h @@ -56,7 +56,8 @@ simde_vcvtph_s64_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtph_s64_f16 #define vcvtph_s64_f16(a) simde_vcvtph_s64_f16(a) #endif @@ -83,7 +84,8 @@ simde_vcvtph_s32_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtph_s32_f16 #define vcvtph_s32_f16(a) simde_vcvtph_s32_f16(a) #endif @@ -110,7 +112,8 @@ simde_vcvtph_u64_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtph_u64_f16 #define vcvtph_u64_f16(a) simde_vcvtph_u64_f16(a) #endif @@ -137,7 +140,8 @@ simde_vcvtph_u32_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtph_u32_f16 #define vcvtph_u32_f16(a) simde_vcvtph_u32_f16(a) #endif @@ -164,7 +168,8 @@ simde_vcvtph_u16_f16(simde_float16_t a) { } #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtph_u16_f16 #define vcvtph_u16_f16(a) simde_vcvtph_u16_f16(a) #endif @@ -234,7 +239,8 @@ simde_vcvtpq_u16_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtpq_u16_f16 #define vcvtpq_u16_f16(a) simde_vcvtpq_u16_f16(a) #endif @@ -268,7 
+274,8 @@ simde_vcvtpq_u32_f32(simde_float32x4_t a) { return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_46844)) #undef vcvtpq_u32_f32 #define vcvtpq_u32_f32(a) simde_vcvtpq_u32_f32(a) #endif @@ -324,7 +331,8 @@ simde_vcvtp_u16_f16(simde_float16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vcvtp_u16_f16 #define vcvtp_u16_f16(a) simde_vcvtp_u16_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/div.h b/thirdparty/simde/arm/neon/div.h index 05a59084b..fed252dbf 100644 --- a/thirdparty/simde/arm/neon/div.h +++ b/thirdparty/simde/arm/neon/div.h @@ -44,7 +44,8 @@ simde_vdivh_f16(simde_float16_t a, simde_float16_t b) { return simde_float16_from_float32(simde_float16_to_float32(a) / simde_float16_to_float32(b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdivh_f16 #define vdivh_f16(a, b) simde_vdivh_f16((a), (b)) #endif @@ -68,7 +69,8 @@ simde_vdiv_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdiv_f16 #define vdiv_f16(a, b) simde_vdiv_f16((a), (b)) #endif @@ -92,7 +94,8 @@ simde_vdivq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdivq_f16 #define vdivq_f16(a, b) simde_vdivq_f16((a), (b)) #endif @@ -196,4 +199,4 @@ simde_vdivq_f64(simde_float64x2_t a, simde_float64x2_t b) { SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP -#endif /* !defined(SIMDE_ARM_NEON_MUL_H) */ +#endif /* !defined(SIMDE_ARM_NEON_DIV_H) */ diff --git a/thirdparty/simde/arm/neon/dot.h b/thirdparty/simde/arm/neon/dot.h index f195710be..6ebe7d6da 100644 --- a/thirdparty/simde/arm/neon/dot.h +++ b/thirdparty/simde/arm/neon/dot.h @@ -84,7 +84,8 @@ simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) { return simde_vadd_s32(r, simde_int32x2_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_s32 #define vdot_s32(r, a, b) simde_vdot_s32((r), (a), (b)) #endif @@ -130,7 +131,8 @@ simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) { return simde_vadd_u32(r, simde_uint32x2_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_u32 #define vdot_u32(r, a, b) simde_vdot_u32((r), (a), (b)) #endif @@ -179,7 +181,8 @@ simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { return simde_vaddq_s32(r, simde_int32x4_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_s32 #define vdotq_s32(r, a, b) simde_vdotq_s32((r), (a), (b)) #endif @@ -228,7 +231,8 @@ simde_vdotq_u32(simde_uint32x4_t r, simde_uint8x16_t a, 
simde_uint8x16_t b) { return simde_vaddq_u32(r, simde_uint32x4_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_u32 #define vdotq_u32(r, a, b) simde_vdotq_u32((r), (a), (b)) #endif @@ -255,7 +259,8 @@ simde_vbfdot_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x4_t b return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vbfdot_f32 #define vbfdot_f32(r, a, b) simde_vbfdot_f32((r), (a), (b)) #endif @@ -282,7 +287,9 @@ simde_vbfdotq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16))) #undef vbfdotq_f32 #define vbfdotq_f32(r, a, b) simde_vbfdotq_f32((r), (a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/dot_lane.h b/thirdparty/simde/arm/neon/dot_lane.h index 71378e9ef..0cc312b35 100644 --- a/thirdparty/simde/arm/neon/dot_lane.h +++ b/thirdparty/simde/arm/neon/dot_lane.h @@ -100,7 +100,8 @@ simde_vdot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b, const return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_lane_s32 #define vdot_lane_s32(r, a, b, lane) simde_vdot_lane_s32((r), (a), (b), (lane)) #endif @@ -164,7 +165,8 @@ simde_vdot_lane_u32(simde_uint32x2_t r, simde_uint8x8_t a, 
simde_uint8x8_t b, co return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_lane_u32 #define vdot_lane_u32(r, a, b, lane) simde_vdot_lane_u32((r), (a), (b), (lane)) #endif @@ -226,7 +228,8 @@ simde_vdot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x16_t b, con return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_laneq_s32 #define vdot_laneq_s32(r, a, b, lane) simde_vdot_laneq_s32((r), (a), (b), (lane)) #endif @@ -287,7 +290,8 @@ simde_vdot_laneq_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x16_t b, #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdot_laneq_u32 #define vdot_laneq_u32(r, a, b, lane) simde_vdot_laneq_u32((r), (a), (b), (lane)) #endif @@ -365,7 +369,8 @@ simde_vdotq_laneq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_laneq_u32 #define vdotq_laneq_u32(r, a, b, lane) simde_vdotq_laneq_u32((r), (a), (b), (lane)) #endif @@ -447,7 +452,8 @@ simde_vdotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b, c #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_laneq_s32 #define 
vdotq_laneq_s32(r, a, b, lane) simde_vdotq_laneq_s32((r), (a), (b), (lane)) #endif @@ -523,7 +529,8 @@ simde_vdotq_lane_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x8_t b, #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_lane_u32 #define vdotq_lane_u32(r, a, b, lane) simde_vdotq_lane_u32((r), (a), (b), (lane)) #endif @@ -600,7 +607,8 @@ simde_vdotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x8_t b, con #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD))) #undef vdotq_lane_s32 #define vdotq_lane_s32(r, a, b, lane) simde_vdotq_lane_s32((r), (a), (b), (lane)) #endif @@ -632,7 +640,9 @@ simde_vbfdot_lane_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16))) #undef vbfdot_lane_f32 #define vbfdot_lane_f32(r, a, b, lane) simde_vbfdot_lane_f32((r), (a), (b), (lane)) #endif @@ -663,7 +673,9 @@ simde_vbfdotq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16 return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16))) #undef vbfdotq_lane_f32 #define vbfdotq_lane_f32(r, a, b, lane) simde_vbfdotq_lane_f32((r), (a), (b), (lane)) #endif @@ -694,7 +706,9 @@ simde_vbfdot_laneq_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16 return 
result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16))) #undef vbfdot_laneq_f32 #define vbfdot_laneq_f32(r, a, b, lane) simde_vbfdot_laneq_f32((r), (a), (b), (lane)) #endif @@ -726,7 +740,9 @@ simde_vbfdotq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat1 return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16))) #undef vbfdotq_laneq_f32 #define vbfdotq_laneq_f32(r, a, b, lane) simde_vbfdotq_laneq_f32((r), (a), (b), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/dup_lane.h b/thirdparty/simde/arm/neon/dup_lane.h index 44db662be..4d013b109 100644 --- a/thirdparty/simde/arm/neon/dup_lane.h +++ b/thirdparty/simde/arm/neon/dup_lane.h @@ -156,7 +156,8 @@ simde_vduph_lane_f16(simde_float16x4_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vduph_lane_f16(vec, lane) vduph_lane_f16(vec, lane) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vduph_lane_f16 #define vduph_lane_f16(vec, lane) simde_vduph_lane_f16((vec), (lane)) #endif @@ -167,7 +168,8 @@ simde_vduph_lane_f16(simde_float16x4_t vec, const int lane) #else #define simde_vdup_lane_f16(vec, lane) simde_vdup_n_f16(simde_vduph_lane_f16(vec, lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdup_lane_f16 #define vdup_lane_f16(vec, lane) 
simde_vdup_lane_f16((vec), (lane)) #endif @@ -181,7 +183,8 @@ simde_vdup_laneq_f16(simde_float16x8_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vdup_laneq_f16(vec, lane) vdup_laneq_f16(vec, lane) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdup_laneq_f16 #define vdup_laneq_f16(vec, lane) simde_vdup_laneq_f16((vec), (lane)) #endif @@ -195,7 +198,8 @@ simde_vdupq_lane_f16(simde_float16x4_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vdupq_lane_f16(vec, lane) vdupq_lane_f16(vec, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdupq_lane_f16 #define vdupq_lane_f16(vec, lane) simde_vdupq_lane_f16((vec), (lane)) #endif @@ -987,7 +991,8 @@ simde_vdupq_laneq_f16(simde_float16x8_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vdupq_laneq_f16(vec, lane) vdupq_laneq_f16(vec, lane) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdupq_laneq_f16 #define vdupq_laneq_f16(vec, lane) simde_vdupq_laneq_f16((vec), (lane)) #endif @@ -1383,7 +1388,8 @@ simde_vduph_laneq_f16(simde_float16x8_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vduph_laneq_f16(vec, lane) vduph_laneq_f16(vec, lane) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ 
+ !(defined(SIMDE_ARM_NEON_FP16))) #undef vduph_laneq_f16 #define vduph_laneq_f16(vec, lane) simde_vduph_laneq_f16((vec), (lane)) #endif @@ -1565,7 +1571,8 @@ simde_vdupb_lane_p8(simde_poly8x8_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vdupb_lane_p8(vec, lane) vdupb_lane_p8((vec), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vdupb_lane_p8 #define vdupb_lane_p8(vec, lane) simde_vdupb_lane_p8((vec), (lane)) #endif @@ -1579,7 +1586,8 @@ simde_vdupb_laneq_p8(simde_poly8x16_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vdupb_laneq_p8(vec, lane) vdupb_laneq_p8((vec), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vdupb_laneq_p8 #define vdupb_laneq_p8(vec, lane) simde_vdupb_laneq_p8((vec), (lane)) #endif @@ -1593,7 +1601,8 @@ simde_vduph_lane_p16(simde_poly16x4_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vduph_lane_p16(vec, lane) vduph_lane_p16((vec), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vduph_lane_p16 #define vduph_lane_p16(vec, lane) simde_vduph_lane_p16((vec), (lane)) #endif @@ -1607,7 +1616,8 @@ simde_vduph_laneq_p16(simde_poly16x8_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vduph_laneq_p16(vec, lane) vduph_laneq_p16((vec), (lane)) #endif -#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vduph_laneq_p16 #define vduph_laneq_p16(vec, lane) simde_vduph_laneq_p16((vec), (lane)) #endif @@ -1621,7 +1631,8 @@ simde_vduph_lane_bf16(simde_bfloat16x4_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vduph_lane_bf16(vec, lane) vduph_lane_bf16(vec, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vduph_lane_bf16 #define vduph_lane_bf16(vec, lane) simde_vduph_lane_bf16((vec), (lane)) #endif @@ -1635,7 +1646,8 @@ simde_vduph_laneq_bf16(simde_bfloat16x8_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vduph_laneq_bf16(vec, lane) vduph_laneq_bf16(vec, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vduph_laneq_bf16 #define vduph_laneq_bf16(vec, lane) simde_vduph_laneq_bf16((vec), (lane)) #endif @@ -1646,7 +1658,8 @@ simde_vduph_laneq_bf16(simde_bfloat16x8_t vec, const int lane) #else #define simde_vdup_lane_bf16(vec, lane) simde_vdup_n_bf16(simde_vduph_lane_bf16(vec, lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vdup_lane_bf16 #define vdup_lane_bf16(vec, lane) simde_vdup_lane_bf16((vec), (lane)) #endif @@ -1660,7 +1673,8 @@ simde_vdup_laneq_bf16(simde_bfloat16x8_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) 
#define simde_vdup_laneq_bf16(vec, lane) vdup_laneq_bf16(vec, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vdup_laneq_bf16 #define vdup_laneq_bf16(vec, lane) simde_vdup_laneq_bf16((vec), (lane)) #endif @@ -1674,7 +1688,8 @@ simde_vdupq_lane_bf16(simde_bfloat16x4_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vdupq_lane_bf16(vec, lane) vdupq_lane_bf16(vec, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vdupq_lane_bf16 #define vdupq_lane_bf16(vec, lane) simde_vdupq_lane_bf16((vec), (lane)) #endif @@ -1688,7 +1703,8 @@ simde_vdupq_laneq_bf16(simde_bfloat16x8_t vec, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vdupq_laneq_bf16(vec, lane) vdupq_laneq_bf16(vec, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vdupq_laneq_bf16 #define vdupq_laneq_bf16(vec, lane) simde_vdupq_laneq_bf16((vec), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/dup_n.h b/thirdparty/simde/arm/neon/dup_n.h index 61b06a3dc..663667ce8 100644 --- a/thirdparty/simde/arm/neon/dup_n.h +++ b/thirdparty/simde/arm/neon/dup_n.h @@ -55,7 +55,8 @@ simde_vdup_n_f16(simde_float16_t value) { #endif } #define simde_vmov_n_f16 simde_vdup_n_f16 -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdup_n_f16 #define vdup_n_f16(value) 
simde_vdup_n_f16((value)) #undef vmov_n_f16 @@ -366,7 +367,8 @@ simde_vdupq_n_f16(simde_float16_t value) { #endif } #define simde_vmovq_n_f16 simde_vdupq_n_f16 -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdupq_n_f16 #define vdupq_n_f16(value) simde_vdupq_n_f16((value)) #undef vmovq_n_f16 @@ -869,7 +871,8 @@ simde_vdup_n_bf16(simde_bfloat16_t value) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdup_n_bf16 #define vdup_n_bf16(value) simde_vdup_n_bf16((value)) #endif @@ -890,7 +893,8 @@ simde_vdupq_n_bf16(simde_bfloat16_t value) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vdupq_n_bf16 #define vdupq_n_bf16(value) simde_vdupq_n_bf16((value)) #endif diff --git a/thirdparty/simde/arm/neon/eor.h b/thirdparty/simde/arm/neon/eor.h index 9bb53b479..50791d1c9 100644 --- a/thirdparty/simde/arm/neon/eor.h +++ b/thirdparty/simde/arm/neon/eor.h @@ -608,7 +608,8 @@ simde_veor3q_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { return simde_int8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) #undef veor3q_s8 #define veor3q_s8(a, b, c) simde_veor3q_s8((a), (b), (c)) #endif @@ -639,7 +640,8 @@ simde_veor3q_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { return simde_int16x8_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) #undef veor3q_s16 #define veor3q_s16(a, b, c) simde_veor3q_s16((a), (b), (c)) #endif @@ -670,7 +672,8 @@ simde_veor3q_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { return simde_int32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) #undef veor3q_s32 #define veor3q_s32(a, b, c) simde_veor3q_s32((a), (b), (c)) #endif @@ -701,7 +704,8 @@ simde_veor3q_s64(simde_int64x2_t a, simde_int64x2_t b, simde_int64x2_t c) { return simde_int64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) #undef veor3q_s64 #define veor3q_s64(a, b, c) simde_veor3q_s64((a), (b), (c)) #endif @@ -732,7 +736,8 @@ simde_veor3q_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { return simde_uint8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) #undef veor3q_u8 #define veor3q_u8(a, b, c) simde_veor3q_u8((a), (b), (c)) #endif @@ -763,7 +768,8 @@ simde_veor3q_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) #undef veor3q_u16 #define veor3q_u16(a, b, c) simde_veor3q_u16((a), (b), (c)) #endif @@ -794,7 +800,8 @@ 
simde_veor3q_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) #undef veor3q_u32 #define veor3q_u32(a, b, c) simde_veor3q_u32((a), (b), (c)) #endif @@ -825,7 +832,8 @@ simde_veor3q_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { return simde_uint64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_SHA3))) #undef veor3q_u64 #define veor3q_u64(a, b, c) simde_veor3q_u64((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/ext.h b/thirdparty/simde/arm/neon/ext.h index 37003a322..c7ef81cec 100644 --- a/thirdparty/simde/arm/neon/ext.h +++ b/thirdparty/simde/arm/neon/ext.h @@ -53,15 +53,17 @@ simde_vext_f16(simde_float16x4_t a, simde_float16x4_t b, const int n) r_.sv64 = __riscv_vslideup_vx_f16m1(a_.sv64, b_.sv64, 4-n, 4); #else const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + const size_t len = sizeof(r_.values) / sizeof(r_.values[0]); + for (size_t i = 0 ; i < len ; i++) { size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + r_.values[i] = (src < len) ? 
a_.values[src] : b_.values[src & 3]; } #endif return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vext_f16 #define vext_f16(a, b, n) simde_vext_f16((a), (b), (n)) #endif @@ -499,15 +501,17 @@ simde_vextq_f16(simde_float16x8_t a, simde_float16x8_t b, const int n) r_.sv128 = __riscv_vslideup_vx_f16m1(a_.sv128, b_.sv128, 8-n, 8); #else const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + const size_t len = sizeof(r_.values) / sizeof(r_.values[0]); + for (size_t i = 0 ; i < len ; i++) { size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + r_.values[i] = (src < len) ? a_.values[src] : b_.values[src & 7]; } #endif return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vextq_f16 #define vextq_f16(a, b, n) simde_vextq_f16((a), (b), (n)) #endif @@ -548,7 +552,7 @@ simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \ simde_float32x4_from_private(simde_vextq_f32_r_); \ })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_f32(a, b, n) (__extension__ ({ \ simde_float32x4_private simde_vextq_f32_r_; \ simde_vextq_f32_r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, simde_float32x4_to_private(a).values, 
simde_float32x4_to_private(b).values, \ @@ -610,6 +614,11 @@ simde_vextq_f64(simde_float64x2_t a, simde_float64x2_t b, const int n) #define vextq_f64(a, b, n) simde_vextq_f64((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_int8x16_t simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n) @@ -652,7 +661,7 @@ simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 14)), HEDLEY_STATIC_CAST(int8_t, ((n) + 15))); \ simde_int8x16_from_private(simde_vextq_s8_r_); \ })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_s8(a, b, n) (__extension__ ({ \ simde_int8x16_private simde_vextq_s8_r_; \ simde_vextq_s8_r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, simde_int8x16_to_private(a).values, simde_int8x16_to_private(b).values, \ @@ -710,7 +719,7 @@ simde_vextq_s16(simde_int16x8_t a, simde_int16x8_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7))); \ simde_int16x8_from_private(simde_vextq_s16_r_); \ })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_s16(a, b, n) (__extension__ ({ \ simde_int16x8_private simde_vextq_s16_r_; \ simde_vextq_s16_r_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, simde_int16x8_to_private(a).values, simde_int16x8_to_private(b).values, \ @@ -762,7 +771,7 @@ 
simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \ simde_int32x4_from_private(simde_vextq_s32_r_); \ })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_s32(a, b, n) (__extension__ ({ \ simde_int32x4_private simde_vextq_s32_r_; \ simde_vextq_s32_r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, simde_int32x4_to_private(a).values, simde_int32x4_to_private(b).values, \ @@ -776,6 +785,10 @@ simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n) #define vextq_s32(a, b, n) simde_vextq_s32((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)) +HEDLEY_DIAGNOSTIC_POP +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_int64x2_t simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n) @@ -811,7 +824,7 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1))); \ simde_int64x2_from_private(simde_vextq_s64_r_); \ })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_s64(a, b, n) (__extension__ ({ \ simde_int64x2_private simde_vextq_s64_r_; \ simde_vextq_s64_r_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, simde_int64x2_to_private(a).values, simde_int64x2_to_private(b).values, \ @@ -824,6 +837,11 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n) #define vextq_s64(a, b, n) simde_vextq_s64((a), (b), 
(n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint8x16_t simde_vextq_u8(simde_uint8x16_t a, simde_uint8x16_t b, const int n) @@ -852,7 +870,7 @@ simde_vextq_u8(simde_uint8x16_t a, simde_uint8x16_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_u8(a, b, n) simde_uint8x16_from_m128i(_mm_alignr_epi8(simde_uint8x16_to_m128i(b), simde_uint8x16_to_m128i(a), n * sizeof(uint8_t))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_u8(a, b, n) (__extension__ ({ \ simde_uint8x16_private simde_vextq_u8_r_; \ simde_vextq_u8_r_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, simde_uint8x16_to_private(a).values, simde_uint8x16_to_private(b).values, \ @@ -900,7 +918,7 @@ simde_vextq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_u16(a, b, n) simde_uint16x8_from_m128i(_mm_alignr_epi8(simde_uint16x8_to_m128i(b), simde_uint16x8_to_m128i(a), n * sizeof(uint16_t))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_u16(a, b, n) (__extension__ ({ \ simde_uint16x8_private simde_vextq_u16_r_; \ simde_vextq_u16_r_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, simde_uint16x8_to_private(a).values, simde_uint16x8_to_private(b).values, \ @@ -910,7 
+928,7 @@ simde_vextq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int n) HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7))); \ simde_uint16x8_from_private(simde_vextq_u16_r_); \ })) -#elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) +#elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_u16(a, b, n) (__extension__ ({ \ simde_uint16x8_private r_; \ r_.values = __builtin_shufflevector( \ @@ -953,7 +971,7 @@ simde_vextq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_u32(a, b, n) simde_uint32x4_from_m128i(_mm_alignr_epi8(simde_uint32x4_to_m128i(b), simde_uint32x4_to_m128i(a), n * sizeof(uint32_t))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_u32(a, b, n) (__extension__ ({ \ simde_uint32x4_private simde_vextq_u32_r_; \ simde_vextq_u32_r_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, simde_uint32x4_to_private(a).values, simde_uint32x4_to_private(b).values, \ @@ -967,6 +985,10 @@ simde_vextq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int n) #define vextq_u32(a, b, n) simde_vextq_u32((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH)) +HEDLEY_DIAGNOSTIC_POP +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2_t simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n) @@ -995,7 +1017,7 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_u64(a, b, n) 
simde_uint64x2_from_m128i(_mm_alignr_epi8(simde_uint64x2_to_m128i(b), simde_uint64x2_to_m128i(a), n * sizeof(uint64_t))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) +#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_121064) #define simde_vextq_u64(a, b, n) (__extension__ ({ \ simde_uint64x2_private simde_vextq_u64_r_; \ simde_vextq_u64_r_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, simde_uint64x2_to_private(a).values, simde_uint64x2_to_private(b).values, \ @@ -1008,6 +1030,11 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n) #define vextq_u64(a, b, n) simde_vextq_u64((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_RISCV64) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_poly8x8_t simde_vext_p8(simde_poly8x8_t a, simde_poly8x8_t b, const int n) @@ -1060,6 +1087,11 @@ simde_vext_p16(simde_poly16x4_t a, simde_poly16x4_t b, const int n) #define vext_p16(a, b, n) simde_vext_p16((a), (b), (n)) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_) && defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_RISCV64) +HEDLEY_DIAGNOSTIC_POP +SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_poly64x1_t simde_vext_p64(simde_poly64x1_t a, simde_poly64x1_t b, const int n) @@ -1099,9 +1131,10 @@ simde_vextq_p8(simde_poly8x16_t a, simde_poly8x16_t b, const int n) b_ = simde_poly8x16_to_private(b), r_ = a_; const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + const size_t len = sizeof(r_.values) / sizeof(r_.values[0]); + for (size_t i = 0 ; i < len ; i++) { size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / 
sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; + r_.values[i] = (src < len) ? a_.values[src] : b_.values[src & 15]; } return simde_poly8x16_from_private(r_); #endif @@ -1125,9 +1158,10 @@ simde_vextq_p16(simde_poly16x8_t a, simde_poly16x8_t b, const int n) b_ = simde_poly16x8_to_private(b), r_ = a_; const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + const size_t len = sizeof(r_.values) / sizeof(r_.values[0]); + for (size_t i = 0 ; i < len ; i++) { size_t src = i + n_; - r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + r_.values[i] = (src < len) ? a_.values[src] : b_.values[src & 7]; } return simde_poly16x8_from_private(r_); #endif diff --git a/thirdparty/simde/arm/neon/fma.h b/thirdparty/simde/arm/neon/fma.h index ecf90d5b5..060e165e3 100644 --- a/thirdparty/simde/arm/neon/fma.h +++ b/thirdparty/simde/arm/neon/fma.h @@ -45,7 +45,8 @@ simde_vfmah_f16(simde_float16_t a, simde_float16_t b, simde_float16_t c) { return simde_vaddh_f16(a, simde_vmulh_f16(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmah_f16 #define vfmah_f16(a, b, c) simde_vfmah_f16(a, b, c) #endif @@ -68,7 +69,8 @@ simde_vfma_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { return simde_vadd_f32(a, simde_vmul_f32(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_f32 #define vfma_f32(a, b, c) simde_vfma_f32(a, b, c) #endif @@ -91,7 +93,8 @@ simde_vfma_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { return simde_vadd_f64(a, simde_vmul_f64(b, 
c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_f64 #define vfma_f64(a, b, c) simde_vfma_f64(a, b, c) #endif @@ -114,7 +117,8 @@ simde_vfma_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16x4_t c) { return simde_vadd_f16(a, simde_vmul_f16(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfma_f16 #define vfma_f16(a, b, c) simde_vfma_f16(a, b, c) #endif @@ -137,7 +141,8 @@ simde_vfmaq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { return simde_vaddq_f16(a, simde_vmulq_f16(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmaq_f16 #define vfmaq_f16(a, b, c) simde_vfmaq_f16(a, b, c) #endif @@ -168,7 +173,8 @@ simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { return simde_vaddq_f32(a, simde_vmulq_f32(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_f32 #define vfmaq_f32(a, b, c) simde_vfmaq_f32(a, b, c) #endif @@ -199,7 +205,8 @@ simde_vfmaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { return simde_vaddq_f64(a, simde_vmulq_f64(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_f64 
#define vfmaq_f64(a, b, c) simde_vfmaq_f64(a, b, c) #endif diff --git a/thirdparty/simde/arm/neon/fma_lane.h b/thirdparty/simde/arm/neon/fma_lane.h index e937f715c..54ae4d6ef 100644 --- a/thirdparty/simde/arm/neon/fma_lane.h +++ b/thirdparty/simde/arm/neon/fma_lane.h @@ -56,7 +56,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmad_lane_f64 #define vfmad_lane_f64(a, b, v, lane) simde_vfmad_lane_f64(a, b, v, lane) #endif @@ -79,7 +80,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmad_laneq_f64 #define vfmad_laneq_f64(a, b, v, lane) simde_vfmad_laneq_f64(a, b, v, lane) #endif @@ -102,7 +104,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmah_lane_f16 #define vfmah_lane_f16(a, b, v, lane) simde_vfmah_lane_f16(a, b, v, lane) #endif @@ -125,7 +128,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmah_laneq_f16 #define vfmah_laneq_f16(a, b, v, lane) simde_vfmah_laneq_f16(a, b, v, lane) #endif @@ -148,7 +152,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmas_lane_f32 
#define vfmas_lane_f32(a, b, v, lane) simde_vfmas_lane_f32(a, b, v, lane) #endif @@ -171,7 +176,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmas_laneq_f32 #define vfmas_laneq_f32(a, b, v, lane) simde_vfmas_laneq_f32(a, b, v, lane) #endif @@ -182,7 +188,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfma_lane_f16(a, b, v, lane) simde_vadd_f16(a, simde_vmul_lane_f16(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfma_lane_f16 #define vfma_lane_f16(a, b, v, lane) simde_vfma_lane_f16(a, b, v, lane) #endif @@ -193,7 +200,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfma_lane_f32(a, b, v, lane) simde_vadd_f32(a, simde_vmul_lane_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_lane_f32 #define vfma_lane_f32(a, b, v, lane) simde_vfma_lane_f32(a, b, v, lane) #endif @@ -204,7 +212,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfma_lane_f64(a, b, v, lane) simde_vadd_f64(a, simde_vmul_lane_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_lane_f64 #define vfma_lane_f64(a, b, v, lane) simde_vfma_lane_f64(a, b, v, lane) #endif @@ -215,7 +224,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfma_laneq_f16(a, b, v, lane) simde_vadd_f16(a, simde_vmul_laneq_f16(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfma_laneq_f16 #define vfma_laneq_f16(a, b, v, lane) simde_vfma_laneq_f16(a, b, v, lane) #endif @@ -226,7 +236,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfma_laneq_f32(a, b, v, lane) simde_vadd_f32(a, simde_vmul_laneq_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_laneq_f32 #define vfma_laneq_f32(a, b, v, lane) simde_vfma_laneq_f32(a, b, v, lane) #endif @@ -237,7 +248,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfma_laneq_f64(a, b, v, lane) simde_vadd_f64(a, simde_vmul_laneq_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfma_laneq_f64 #define vfma_laneq_f64(a, b, v, lane) simde_vfma_laneq_f64(a, b, v, lane) #endif @@ -248,7 +260,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfmaq_lane_f64(a, b, v, lane) simde_vaddq_f64(a, simde_vmulq_lane_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_lane_f64 #define vfmaq_lane_f64(a, b, v, lane) simde_vfmaq_lane_f64(a, b, v, lane) #endif @@ -259,7 +272,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfmaq_lane_f16(a, b, v, lane) simde_vaddq_f16(a, simde_vmulq_lane_f16(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmaq_lane_f16 
#define vfmaq_lane_f16(a, b, v, lane) simde_vfmaq_lane_f16(a, b, v, lane) #endif @@ -270,7 +284,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfmaq_lane_f32(a, b, v, lane) simde_vaddq_f32(a, simde_vmulq_lane_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_lane_f32 #define vfmaq_lane_f32(a, b, v, lane) simde_vfmaq_lane_f32(a, b, v, lane) #endif @@ -282,7 +297,8 @@ SIMDE_BEGIN_DECLS_ #define simde_vfmaq_laneq_f16(a, b, v, lane) \ simde_vaddq_f16(a, simde_vmulq_laneq_f16(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmaq_laneq_f16 #define vfmaq_laneq_f16(a, b, v, lane) simde_vfmaq_laneq_f16(a, b, v, lane) #endif @@ -294,7 +310,8 @@ SIMDE_BEGIN_DECLS_ #define simde_vfmaq_laneq_f32(a, b, v, lane) \ simde_vaddq_f32(a, simde_vmulq_laneq_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_laneq_f32 #define vfmaq_laneq_f32(a, b, v, lane) simde_vfmaq_laneq_f32(a, b, v, lane) #endif @@ -306,7 +323,8 @@ SIMDE_BEGIN_DECLS_ #define simde_vfmaq_laneq_f64(a, b, v, lane) \ simde_vaddq_f64(a, simde_vmulq_laneq_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmaq_laneq_f64 #define vfmaq_laneq_f64(a, b, v, lane) simde_vfmaq_laneq_f64(a, b, v, lane) #endif diff --git a/thirdparty/simde/arm/neon/fma_n.h b/thirdparty/simde/arm/neon/fma_n.h index 
0a23407c6..e9afae87c 100644 --- a/thirdparty/simde/arm/neon/fma_n.h +++ b/thirdparty/simde/arm/neon/fma_n.h @@ -45,7 +45,8 @@ simde_vfma_n_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16_t c) { return simde_vfma_f16(a, b, simde_vdup_n_f16(c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) #undef vfma_n_f16 #define vfma_n_f16(a, b, c) simde_vfma_n_f16(a, b, c) #endif @@ -59,7 +60,8 @@ simde_vfmaq_n_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16_t c) { return simde_vfmaq_f16(a, b, simde_vdupq_n_f16(c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmaq_n_f16 #define vfmaq_n_f16(a, b, c) simde_vfmaq_n_f16(a, b, c) #endif @@ -73,7 +75,8 @@ simde_vfma_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { return simde_vfma_f32(a, b, simde_vdup_n_f32(c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399))) #undef vfma_n_f32 #define vfma_n_f32(a, b, c) simde_vfma_n_f32(a, b, c) #endif @@ -87,7 +90,8 @@ simde_vfma_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { return simde_vfma_f64(a, b, simde_vdup_n_f64(c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vfma_n_f64 #define vfma_n_f64(a, b, c) simde_vfma_n_f64(a, b, c) #endif @@ -101,7 +105,8 @@ simde_vfmaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { return simde_vfmaq_f32(a, b, simde_vdupq_n_f32(c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399))) #undef vfmaq_n_f32 #define vfmaq_n_f32(a, b, c) simde_vfmaq_n_f32(a, b, c) #endif @@ -115,7 +120,8 @@ simde_vfmaq_n_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64_t c) { return simde_vfmaq_f64(a, b, simde_vdupq_n_f64(c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vfmaq_n_f64 #define vfmaq_n_f64(a, b, c) simde_vfmaq_n_f64(a, b, c) #endif diff --git a/thirdparty/simde/arm/neon/fmlal.h b/thirdparty/simde/arm/neon/fmlal.h index f71d3019c..8fa297e5d 100644 --- a/thirdparty/simde/arm/neon/fmlal.h +++ b/thirdparty/simde/arm/neon/fmlal.h @@ -55,7 +55,9 @@ simde_vfmlal_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t return simde_float32x2_from_private(ret_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlal_low_f16 #define vfmlal_low_f16(r, a, b) simde_vfmlal_low_f16((r), (a), (b)) 
#endif @@ -82,7 +84,9 @@ simde_vfmlalq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_ return simde_float32x4_from_private(ret_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlalq_low_f16 #define vfmlalq_low_f16(r, a, b) simde_vfmlalq_low_f16((r), (a), (b)) #endif @@ -110,7 +114,9 @@ simde_vfmlal_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_ return simde_float32x2_from_private(ret_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlal_high_f16 #define vfmlal_high_f16(r, a, b) simde_vfmlal_high_f16((r), (a), (b)) #endif @@ -138,7 +144,9 @@ simde_vfmlalq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8 return simde_float32x4_from_private(ret_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlalq_high_f16 #define vfmlalq_high_f16(r, a, b) simde_vfmlalq_high_f16((r), (a), (b)) #endif @@ -163,11 +171,13 @@ simde_vfmlal_lane_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float1 } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlal_lane_low_f16(r, a, b, lane) vfmlal_lane_low_f16((r), (a), (b), (lane)); + #define simde_vfmlal_lane_low_f16(r, a, b, lane) vfmlal_lane_low_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) 
|| (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlal_lane_low_f16 - #define vfmlal_lane_low_f16(r, a, b, lane) simde_vfmlal_lane_low_f16((r), (a), (b), (lane)); + #define vfmlal_lane_low_f16(r, a, b, lane) simde_vfmlal_lane_low_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -191,11 +201,13 @@ simde_vfmlal_laneq_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlal_laneq_low_f16(r, a, b, lane) vfmlal_laneq_low_f16((r), (a), (b), (lane)); + #define simde_vfmlal_laneq_low_f16(r, a, b, lane) vfmlal_laneq_low_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlal_laneq_low_f16 - #define vfmlal_laneq_low_f16(r, a, b, lane) simde_vfmlal_laneq_low_f16((r), (a), (b), (lane)); + #define vfmlal_laneq_low_f16(r, a, b, lane) simde_vfmlal_laneq_low_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -219,11 +231,13 @@ simde_vfmlalq_lane_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlalq_lane_low_f16(r, a, b, lane) vfmlalq_lane_low_f16((r), (a), (b), (lane)); + #define simde_vfmlalq_lane_low_f16(r, a, b, lane) vfmlalq_lane_low_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlalq_lane_low_f16 - #define vfmlalq_lane_low_f16(r, a, b, lane) 
simde_vfmlalq_lane_low_f16((r), (a), (b), (lane)); + #define vfmlalq_lane_low_f16(r, a, b, lane) simde_vfmlalq_lane_low_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -246,11 +260,13 @@ simde_vfmlalq_laneq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_floa } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlalq_laneq_low_f16(r, a, b, lane) vfmlalq_laneq_low_f16((r), (a), (b), (lane)); + #define simde_vfmlalq_laneq_low_f16(r, a, b, lane) vfmlalq_laneq_low_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlalq_laneq_low_f16 - #define vfmlalq_laneq_low_f16(r, a, b, lane) simde_vfmlalq_laneq_low_f16((r), (a), (b), (lane)); + #define vfmlalq_laneq_low_f16(r, a, b, lane) simde_vfmlalq_laneq_low_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -274,11 +290,13 @@ simde_vfmlal_lane_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlal_lane_high_f16(r, a, b, lane) vfmlal_lane_high_f16((r), (a), (b), (lane)); + #define simde_vfmlal_lane_high_f16(r, a, b, lane) vfmlal_lane_high_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlal_lane_high_f16 - #define vfmlal_lane_high_f16(r, a, b, lane) simde_vfmlal_lane_high_f16((r), (a), (b), (lane)); + #define vfmlal_lane_high_f16(r, a, b, lane) simde_vfmlal_lane_high_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES 
@@ -303,11 +321,13 @@ simde_vfmlal_laneq_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_floa } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlal_laneq_high_f16(r, a, b, lane) vfmlal_laneq_high_f16((r), (a), (b), (lane)); + #define simde_vfmlal_laneq_high_f16(r, a, b, lane) vfmlal_laneq_high_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlal_laneq_high_f16 - #define vfmlal_laneq_high_f16(r, a, b, lane) simde_vfmlal_laneq_high_f16((r), (a), (b), (lane)); + #define vfmlal_laneq_high_f16(r, a, b, lane) simde_vfmlal_laneq_high_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -332,11 +352,13 @@ simde_vfmlalq_lane_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_floa } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlalq_lane_high_f16(r, a, b, lane) vfmlalq_lane_high_f16((r), (a), (b), (lane)); + #define simde_vfmlalq_lane_high_f16(r, a, b, lane) vfmlalq_lane_high_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlalq_lane_high_f16 - #define vfmlalq_lane_high_f16(r, a, b, lane) simde_vfmlalq_lane_high_f16((r), (a), (b), (lane)); + #define vfmlalq_lane_high_f16(r, a, b, lane) simde_vfmlalq_lane_high_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -360,11 +382,13 @@ simde_vfmlalq_laneq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_flo } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlalq_laneq_high_f16(r, a, b, lane) vfmlalq_laneq_high_f16((r), (a), (b), (lane)); + #define simde_vfmlalq_laneq_high_f16(r, a, b, lane) vfmlalq_laneq_high_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlalq_laneq_high_f16 - #define vfmlalq_laneq_high_f16(r, a, b, lane) simde_vfmlalq_laneq_high_f16((r), (a), (b), (lane)); + #define vfmlalq_laneq_high_f16(r, a, b, lane) simde_vfmlalq_laneq_high_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -388,7 +412,8 @@ simde_vbfmlalbq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_ return simde_float32x4_from_private(ret); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vbfmlalbq_f32 #define vbfmlalbq_f32(r, a, b) simde_vbfmlalbq_f32((r), (a), (b)) #endif @@ -414,7 +439,8 @@ simde_vbfmlaltq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_ return simde_float32x4_from_private(ret); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vbfmlaltq_f32 #define vbfmlaltq_f32(r, a, b) simde_vbfmlaltq_f32((r), (a), (b)) #endif @@ -439,7 +465,8 @@ simde_vbfmlalbq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vbfmlalbq_lane_f32(r, a, b, lane) vbfmlalbq_lane_f32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vbfmlalbq_lane_f32 #define vbfmlalbq_lane_f32(r, a, b, lane) simde_vbfmlalbq_lane_f32((r), (a), (b), (lane)) #endif @@ -465,7 +492,8 @@ simde_vbfmlalbq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloa #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vbfmlalbq_laneq_f32(r, a, b, lane) vbfmlalbq_laneq_f32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vbfmlalbq_laneq_f32 #define vbfmlalbq_laneq_f32(r, a, b, lane) simde_vbfmlalbq_laneq_f32((r), (a), (b), (lane)) #endif @@ -490,7 +518,8 @@ simde_vbfmlaltq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vbfmlaltq_lane_f32(r, a, b, lane) vbfmlaltq_lane_f32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vbfmlaltq_lane_f32 #define vbfmlaltq_lane_f32(r, a, b, lane) simde_vbfmlaltq_lane_f32((r), (a), (b), (lane)) #endif @@ -516,7 +545,8 @@ simde_vbfmlaltq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloa #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vbfmlaltq_laneq_f32(r, a, b, lane) vbfmlaltq_laneq_f32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vbfmlaltq_laneq_f32 #define vbfmlaltq_laneq_f32(r, a, b, lane) 
simde_vbfmlaltq_laneq_f32((r), (a), (b), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/fmlsl.h b/thirdparty/simde/arm/neon/fmlsl.h index 8a5be5461..1517fafd9 100644 --- a/thirdparty/simde/arm/neon/fmlsl.h +++ b/thirdparty/simde/arm/neon/fmlsl.h @@ -55,7 +55,9 @@ simde_vfmlsl_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t return simde_float32x2_from_private(ret_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlsl_low_f16 #define vfmlsl_low_f16(r, a, b) simde_vfmlsl_low_f16((r), (a), (b)) #endif @@ -82,7 +84,9 @@ simde_vfmlslq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_ return simde_float32x4_from_private(ret_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlslq_low_f16 #define vfmlslq_low_f16(r, a, b) simde_vfmlslq_low_f16((r), (a), (b)) #endif @@ -110,7 +114,9 @@ simde_vfmlsl_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_ return simde_float32x2_from_private(ret_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlsl_high_f16 #define vfmlsl_high_f16(r, a, b) simde_vfmlsl_high_f16((r), (a), (b)) #endif @@ -138,7 +144,9 @@ simde_vfmlslq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8 return simde_float32x4_from_private(ret_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlslq_high_f16 #define vfmlslq_high_f16(r, a, b) simde_vfmlslq_high_f16((r), (a), (b)) #endif @@ -163,11 +171,13 @@ simde_vfmlsl_lane_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float1 } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlsl_lane_low_f16(r, a, b, lane) vfmlsl_lane_low_f16((r), (a), (b), (lane)); + #define simde_vfmlsl_lane_low_f16(r, a, b, lane) vfmlsl_lane_low_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlsl_lane_low_f16 - #define vfmlsl_lane_low_f16(r, a, b, lane) simde_vfmlsl_lane_low_f16((r), (a), (b), (lane)); + #define vfmlsl_lane_low_f16(r, a, b, lane) simde_vfmlsl_lane_low_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -191,11 +201,13 @@ simde_vfmlsl_laneq_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlsl_laneq_low_f16(r, a, b, lane) vfmlsl_laneq_low_f16((r), (a), (b), (lane)); + #define simde_vfmlsl_laneq_low_f16(r, a, b, lane) vfmlsl_laneq_low_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlsl_laneq_low_f16 - #define vfmlsl_laneq_low_f16(r, a, b, lane) simde_vfmlsl_laneq_low_f16((r), (a), (b), (lane)); + #define vfmlsl_laneq_low_f16(r, a, b, lane) simde_vfmlsl_laneq_low_f16((r), (a), (b), (lane)) #endif 
SIMDE_FUNCTION_ATTRIBUTES @@ -219,11 +231,13 @@ simde_vfmlslq_lane_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlslq_lane_low_f16(r, a, b, lane) vfmlslq_lane_low_f16((r), (a), (b), (lane)); + #define simde_vfmlslq_lane_low_f16(r, a, b, lane) vfmlslq_lane_low_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlslq_lane_low_f16 - #define vfmlslq_lane_low_f16(r, a, b, lane) simde_vfmlslq_lane_low_f16((r), (a), (b), (lane)); + #define vfmlslq_lane_low_f16(r, a, b, lane) simde_vfmlslq_lane_low_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -246,11 +260,13 @@ simde_vfmlslq_laneq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_floa } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlslq_laneq_low_f16(r, a, b, lane) vfmlslq_laneq_low_f16((r), (a), (b), (lane)); + #define simde_vfmlslq_laneq_low_f16(r, a, b, lane) vfmlslq_laneq_low_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlslq_laneq_low_f16 - #define vfmlslq_laneq_low_f16(r, a, b, lane) simde_vfmlslq_laneq_low_f16((r), (a), (b), (lane)); + #define vfmlslq_laneq_low_f16(r, a, b, lane) simde_vfmlslq_laneq_low_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -274,11 +290,13 @@ simde_vfmlsl_lane_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlsl_lane_high_f16(r, a, b, lane) vfmlsl_lane_high_f16((r), (a), (b), (lane)); + #define simde_vfmlsl_lane_high_f16(r, a, b, lane) vfmlsl_lane_high_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlsl_lane_high_f16 - #define vfmlsl_lane_high_f16(r, a, b, lane) simde_vfmlsl_lane_high_f16((r), (a), (b), (lane)); + #define vfmlsl_lane_high_f16(r, a, b, lane) simde_vfmlsl_lane_high_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -303,11 +321,13 @@ simde_vfmlsl_laneq_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_floa } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlsl_laneq_high_f16(r, a, b, lane) vfmlsl_laneq_high_f16((r), (a), (b), (lane)); + #define simde_vfmlsl_laneq_high_f16(r, a, b, lane) vfmlsl_laneq_high_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlsl_laneq_high_f16 - #define vfmlsl_laneq_high_f16(r, a, b, lane) simde_vfmlsl_laneq_high_f16((r), (a), (b), (lane)); + #define vfmlsl_laneq_high_f16(r, a, b, lane) simde_vfmlsl_laneq_high_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -332,11 +352,13 @@ simde_vfmlslq_lane_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_floa } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlslq_lane_high_f16(r, a, b, lane) vfmlslq_lane_high_f16((r), (a), (b), (lane)); + #define 
simde_vfmlslq_lane_high_f16(r, a, b, lane) vfmlslq_lane_high_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlslq_lane_high_f16 - #define vfmlslq_lane_high_f16(r, a, b, lane) simde_vfmlslq_lane_high_f16((r), (a), (b), (lane)); + #define vfmlslq_lane_high_f16(r, a, b, lane) simde_vfmlslq_lane_high_f16((r), (a), (b), (lane)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -360,11 +382,13 @@ simde_vfmlslq_laneq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_flo } #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ defined(SIMDE_ARCH_ARM_FP16_FML) - #define simde_vfmlslq_laneq_high_f16(r, a, b, lane) vfmlslq_laneq_high_f16((r), (a), (b), (lane)); + #define simde_vfmlslq_laneq_high_f16(r, a, b, lane) vfmlslq_laneq_high_f16((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML))) #undef vfmlslq_laneq_high_f16 - #define vfmlslq_laneq_high_f16(r, a, b, lane) simde_vfmlslq_laneq_high_f16((r), (a), (b), (lane)); + #define vfmlslq_laneq_high_f16(r, a, b, lane) simde_vfmlslq_laneq_high_f16((r), (a), (b), (lane)) #endif SIMDE_END_DECLS_ diff --git a/thirdparty/simde/arm/neon/fms.h b/thirdparty/simde/arm/neon/fms.h index 21823f2c0..b1eb99c7d 100644 --- a/thirdparty/simde/arm/neon/fms.h +++ b/thirdparty/simde/arm/neon/fms.h @@ -45,7 +45,8 @@ simde_vfmsh_f16(simde_float16_t a, simde_float16_t b, simde_float16_t c) { return simde_vaddh_f16(a, simde_vnegh_f16(simde_vmulh_f16(b, c))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmsh_f16 #define vfmsh_f16(a, b, c) simde_vfmsh_f16(a, b, c) #endif @@ -67,7 +68,8 @@ simde_vfms_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { return simde_vadd_f32(a, simde_vneg_f32(simde_vmul_f32(b, c))); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfms_f32 #define vfms_f32(a, b, c) simde_vfms_f32(a, b, c) #endif @@ -89,7 +91,8 @@ simde_vfms_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { return simde_vadd_f64(a, simde_vneg_f64(simde_vmul_f64(b, c))); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfms_f64 #define vfms_f64(a, b, c) simde_vfms_f64(a, b, c) #endif @@ -111,7 +114,8 @@ simde_vfms_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16x4_t c) { return simde_vadd_f16(a, simde_vneg_f16(simde_vmul_f16(b, c))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfms_f16 #define vfms_f16(a, b, c) simde_vfms_f16(a, b, c) #endif @@ -133,7 +137,8 @@ simde_vfmsq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { return simde_vaddq_f16(a, simde_vnegq_f16(simde_vmulq_f16(b, c))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmsq_f16 #define vfmsq_f16(a, b, c) simde_vfmsq_f16(a, 
b, c) #endif @@ -155,7 +160,8 @@ simde_vfmsq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { return simde_vaddq_f32(a, simde_vnegq_f32(simde_vmulq_f32(b, c))); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmsq_f32 #define vfmsq_f32(a, b, c) simde_vfmsq_f32(a, b, c) #endif @@ -177,7 +183,8 @@ simde_vfmsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { return simde_vaddq_f64(a, simde_vnegq_f64(simde_vmulq_f64(b, c))); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmsq_f64 #define vfmsq_f64(a, b, c) simde_vfmsq_f64(a, b, c) #endif diff --git a/thirdparty/simde/arm/neon/fms_lane.h b/thirdparty/simde/arm/neon/fms_lane.h index 05ef96ae3..d0f9f86c2 100644 --- a/thirdparty/simde/arm/neon/fms_lane.h +++ b/thirdparty/simde/arm/neon/fms_lane.h @@ -55,7 +55,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmsd_lane_f64 #define vfmsd_lane_f64(a, b, v, lane) simde_vfmsd_lane_f64(a, b, v, lane) #endif @@ -78,7 +79,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmsd_laneq_f64 #define vfmsd_laneq_f64(a, b, v, lane) simde_vfmsd_laneq_f64(a, b, v, lane) #endif @@ -101,7 +103,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmsh_lane_f16 #define vfmsh_lane_f16(a, b, v, lane) simde_vfmsh_lane_f16(a, b, v, lane) #endif @@ -124,7 +127,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmsh_laneq_f16 #define vfmsh_laneq_f16(a, b, v, lane) simde_vfmsh_laneq_f16(a, b, v, lane) #endif @@ -147,7 +151,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmss_lane_f32 #define vfmss_lane_f32(a, b, v, lane) simde_vfmss_lane_f32(a, b, v, lane) #endif @@ -170,7 +175,8 @@ SIMDE_BEGIN_DECLS_ 0 \ ) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmss_laneq_f32 #define vfmss_laneq_f32(a, b, v, lane) simde_vfmss_laneq_f32(a, b, v, lane) #endif @@ -181,7 +187,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfms_lane_f16(a, b, v, lane) simde_vsub_f16(a, simde_vmul_lane_f16(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfms_lane_f16 #define vfms_lane_f16(a, b, v, lane) simde_vfms_lane_f16(a, b, v, lane) #endif @@ -192,7 +199,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfms_lane_f32(a, b, v, lane) simde_vsub_f32(a, simde_vmul_lane_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfms_lane_f32 #define vfms_lane_f32(a, b, v, lane) simde_vfms_lane_f32(a, b, v, lane) #endif @@ -203,7 +211,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfms_lane_f64(a, b, v, lane) simde_vsub_f64(a, simde_vmul_lane_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfms_lane_f64 #define vfms_lane_f64(a, b, v, lane) simde_vfms_lane_f64(a, b, v, lane) #endif @@ -214,7 +223,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfms_laneq_f16(a, b, v, lane) simde_vsub_f16(a, simde_vmul_laneq_f16(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfms_laneq_f16 #define vfms_laneq_f16(a, b, v, lane) simde_vfms_laneq_f16(a, b, v, lane) #endif @@ -225,7 +235,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfms_laneq_f32(a, b, v, lane) simde_vsub_f32(a, simde_vmul_laneq_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfms_laneq_f32 #define vfms_laneq_f32(a, b, v, lane) simde_vfms_laneq_f32(a, b, v, lane) #endif @@ -236,7 +247,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfms_laneq_f64(a, b, v, lane) simde_vsub_f64(a, simde_vmul_laneq_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfms_laneq_f64 #define vfms_laneq_f64(a, b, v, lane) 
simde_vfms_laneq_f64(a, b, v, lane) #endif @@ -247,7 +259,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfmsq_lane_f64(a, b, v, lane) simde_vsubq_f64(a, simde_vmulq_lane_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmsq_lane_f64 #define vfmsq_lane_f64(a, b, v, lane) simde_vfmsq_lane_f64(a, b, v, lane) #endif @@ -258,7 +271,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfmsq_lane_f16(a, b, v, lane) simde_vsubq_f16(a, simde_vmulq_lane_f16(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmsq_lane_f16 #define vfmsq_lane_f16(a, b, v, lane) simde_vfmsq_lane_f16(a, b, v, lane) #endif @@ -269,7 +283,8 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vfmsq_lane_f32(a, b, v, lane) simde_vsubq_f32(a, simde_vmulq_lane_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmsq_lane_f32 #define vfmsq_lane_f32(a, b, v, lane) simde_vfmsq_lane_f32(a, b, v, lane) #endif @@ -281,7 +296,8 @@ SIMDE_BEGIN_DECLS_ #define simde_vfmsq_laneq_f16(a, b, v, lane) \ simde_vsubq_f16(a, simde_vmulq_laneq_f16(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmsq_laneq_f16 #define vfmsq_laneq_f16(a, b, v, lane) simde_vfmsq_laneq_f16(a, b, v, lane) #endif @@ -293,7 +309,8 @@ SIMDE_BEGIN_DECLS_ #define simde_vfmsq_laneq_f32(a, b, v, lane) \ simde_vsubq_f32(a, 
simde_vmulq_laneq_f32(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmsq_laneq_f32 #define vfmsq_laneq_f32(a, b, v, lane) simde_vfmsq_laneq_f32(a, b, v, lane) #endif @@ -305,7 +322,8 @@ SIMDE_BEGIN_DECLS_ #define simde_vfmsq_laneq_f64(a, b, v, lane) \ simde_vsubq_f64(a, simde_vmulq_laneq_f64(b, v, lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA))) #undef vfmsq_laneq_f64 #define vfmsq_laneq_f64(a, b, v, lane) simde_vfmsq_laneq_f64(a, b, v, lane) #endif diff --git a/thirdparty/simde/arm/neon/fms_n.h b/thirdparty/simde/arm/neon/fms_n.h index 6783988a2..bf2663d20 100644 --- a/thirdparty/simde/arm/neon/fms_n.h +++ b/thirdparty/simde/arm/neon/fms_n.h @@ -52,7 +52,8 @@ simde_vfms_n_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16_t c) { return simde_vfms_f16(a, b, simde_vdup_n_f16(c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) #undef vfms_n_f16 #define vfms_n_f16(a, b, c) simde_vfms_n_f16(a, b, c) #endif @@ -73,7 +74,8 @@ simde_vfmsq_n_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16_t c) { return simde_vfmsq_f16(a, b, simde_vdupq_n_f16(c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && 
!defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) #undef vfmsq_n_f16 #define vfmsq_n_f16(a, b, c) simde_vfmsq_n_f16(a, b, c) #endif @@ -94,7 +96,8 @@ simde_vfms_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { return simde_vfms_f32(a, b, simde_vdup_n_f32(c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399))) #undef vfms_n_f32 #define vfms_n_f32(a, b, c) simde_vfms_n_f32(a, b, c) #endif @@ -115,7 +118,8 @@ simde_vfms_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { return simde_vfms_f64(a, b, simde_vdup_n_f64(c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vfms_n_f64 #define vfms_n_f64(a, b, c) simde_vfms_n_f64(a, b, c) #endif @@ -136,7 +140,8 @@ simde_vfmsq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { return simde_vfmsq_f32(a, b, simde_vdupq_n_f32(c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399))) #undef vfmsq_n_f32 #define vfmsq_n_f32(a, b, c) simde_vfmsq_n_f32(a, b, c) #endif @@ -157,7 +162,8 @@ simde_vfmsq_n_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64_t c) { return simde_vfmsq_f64(a, b, simde_vdupq_n_f64(c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vfmsq_n_f64 #define vfmsq_n_f64(a, b, c) simde_vfmsq_n_f64(a, b, c) #endif diff --git a/thirdparty/simde/arm/neon/get_high.h b/thirdparty/simde/arm/neon/get_high.h index 899dc3f45..ff8f537ef 100644 --- a/thirdparty/simde/arm/neon/get_high.h +++ b/thirdparty/simde/arm/neon/get_high.h @@ -55,7 +55,8 @@ simde_vget_high_f16(simde_float16x8_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vget_high_f16 #define vget_high_f16(a) simde_vget_high_f16((a)) #endif diff --git a/thirdparty/simde/arm/neon/get_lane.h b/thirdparty/simde/arm/neon/get_lane.h index 06040eb2c..d19dd9847 100644 --- a/thirdparty/simde/arm/neon/get_lane.h +++ b/thirdparty/simde/arm/neon/get_lane.h @@ -51,7 +51,8 @@ simde_vget_lane_f16(simde_float16x4_t v, const int lane) return r; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vget_lane_f16 #define vget_lane_f16(v, lane) simde_vget_lane_f16((v), (lane)) #endif @@ -285,7 +286,8 @@ simde_vgetq_lane_f16(simde_float16x8_t v, const int lane) return r; } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vgetq_lane_f16 #define vgetq_lane_f16(v, lane) simde_vgetq_lane_f16((v), (lane)) #endif @@ -569,7 +571,8 @@ simde_vget_lane_p8(simde_poly8x8_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vget_lane_p8(v, lane) vget_lane_p8((v), (lane)) 
#endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vget_lane_p8 #define vget_lane_p8(v, lane) simde_vget_lane_p8((v), (lane)) #endif @@ -588,7 +591,8 @@ simde_vget_lane_p16(simde_poly16x4_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vget_lane_p16(v, lane) vget_lane_p16((v), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vget_lane_p16 #define vget_lane_p16(v, lane) simde_vget_lane_p16((v), (lane)) #endif @@ -607,7 +611,8 @@ simde_vget_lane_p64(simde_poly64x1_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vget_lane_p64(v, lane) vget_lane_p64((v), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vget_lane_p64 #define vget_lane_p64(v, lane) simde_vget_lane_p64((v), (lane)) #endif @@ -626,7 +631,8 @@ simde_vgetq_lane_p8(simde_poly8x16_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vgetq_lane_p8(v, lane) vgetq_lane_p8((v), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vgetq_lane_p8 #define vgetq_lane_p8(v, lane) simde_vgetq_lane_p8((v), (lane)) #endif @@ -645,7 +651,8 @@ simde_vgetq_lane_p16(simde_poly16x8_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define 
simde_vgetq_lane_p16(v, lane) vgetq_lane_p16((v), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vgetq_lane_p16 #define vgetq_lane_p16(v, lane) simde_vgetq_lane_p16((v), (lane)) #endif @@ -664,7 +671,8 @@ simde_vgetq_lane_p64(simde_poly64x2_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vgetq_lane_p64(v, lane) vgetq_lane_p64((v), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vgetq_lane_p64 #define vgetq_lane_p64(v, lane) simde_vgetq_lane_p64((v), (lane)) #endif @@ -685,7 +693,8 @@ simde_vget_lane_bf16(simde_bfloat16x4_t v, const int lane) return r; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vget_lane_bf16 #define vget_lane_bf16(v, lane) simde_vget_lane_bf16((v), (lane)) #endif @@ -706,7 +715,8 @@ simde_vgetq_lane_bf16(simde_bfloat16x8_t v, const int lane) return r; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vgetq_lane_bf16 #define vgetq_lane_bf16(v, lane) simde_vgetq_lane_bf16((v), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/get_low.h b/thirdparty/simde/arm/neon/get_low.h index 99180cb72..36fae5890 100644 --- a/thirdparty/simde/arm/neon/get_low.h +++ b/thirdparty/simde/arm/neon/get_low.h @@ -57,7 +57,8 @@ simde_vget_low_f16(simde_float16x8_t a) { return simde_float16x4_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vget_low_f16 #define vget_low_f16(a) simde_vget_low_f16((a)) #endif @@ -457,7 +458,8 @@ simde_vget_low_bf16(simde_bfloat16x8_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vget_low_bf16 #define vget_low_bf16(a) simde_vget_low_bf16((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld1.h b/thirdparty/simde/arm/neon/ld1.h index 5dd2d17c6..2f59b6902 100644 --- a/thirdparty/simde/arm/neon/ld1.h +++ b/thirdparty/simde/arm/neon/ld1.h @@ -51,7 +51,7 @@ simde_vld1_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vld1_f16 #define vld1_f16(a) simde_vld1_f16((a)) #endif @@ -267,13 +267,15 @@ simde_vld1q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) r_.sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); + #elif defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + r_.m128h = _mm_loadu_ph(SIMDE_ALIGN_CAST(__m128h const *, ptr)); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vld1q_f16 #define vld1q_f16(a) simde_vld1q_f16((a)) #endif @@ -289,6 +291,8 @@ 
simde_vld1q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(4)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv128 = __riscv_vle32_v_f32m1(ptr , 4); + #elif defined(SIMDE_X86_SSE_NATIVE) + r_.m128 = _mm_loadu_ps(ptr); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -311,6 +315,8 @@ simde_vld1q_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(2)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv128 = __riscv_vle64_v_f64m1(ptr , 2); + #elif defined(SIMDE_X86_SSE2_NATIVE) + r_.m128d = _mm_loadu_pd(ptr); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -333,6 +339,10 @@ simde_vld1q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv128 = __riscv_vle8_v_i8m1(ptr , 16); + #elif defined(SIMDE_X86_SSE3_NATIVE) + r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); + #elif defined(SIMDE_X86_SSE2_NATIVE) + r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -355,6 +365,10 @@ simde_vld1q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv128 = __riscv_vle16_v_i16m1(ptr , 8); + #elif defined(SIMDE_X86_SSE3_NATIVE) + r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); + #elif defined(SIMDE_X86_SSE2_NATIVE) + r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -377,6 +391,10 @@ simde_vld1q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv128 = __riscv_vle32_v_i32m1(ptr , 4); + #elif defined(SIMDE_X86_SSE3_NATIVE) + r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); + #elif defined(SIMDE_X86_SSE2_NATIVE) + r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -399,6 +417,10 
@@ simde_vld1q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv128 = __riscv_vle64_v_i64m1(ptr , 2); + #elif defined(SIMDE_X86_SSE3_NATIVE) + r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); + #elif defined(SIMDE_X86_SSE2_NATIVE) + r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -421,6 +443,10 @@ simde_vld1q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv128 = __riscv_vle8_v_u8m1(ptr , 16); + #elif defined(SIMDE_X86_SSE3_NATIVE) + r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); + #elif defined(SIMDE_X86_SSE2_NATIVE) + r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -443,6 +469,10 @@ simde_vld1q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv128 = __riscv_vle16_v_u16m1(ptr , 8); + #elif defined(SIMDE_X86_SSE3_NATIVE) + r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); + #elif defined(SIMDE_X86_SSE2_NATIVE) + r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -465,6 +495,10 @@ simde_vld1q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv128 = __riscv_vle32_v_u32m1(ptr , 4); + #elif defined(SIMDE_X86_SSE3_NATIVE) + r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); + #elif defined(SIMDE_X86_SSE2_NATIVE) + r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -487,6 +521,10 @@ simde_vld1q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { r_.v128 = wasm_v128_load(ptr); #elif defined(SIMDE_RISCV_V_NATIVE) r_.sv128 = 
__riscv_vle64_v_u64m1(ptr , 2); + #elif defined(SIMDE_X86_SSE3_NATIVE) + r_.m128i = _mm_lddqu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); + #elif defined(SIMDE_X86_SSE2_NATIVE) + r_.m128i = _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, ptr)); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -622,7 +660,7 @@ simde_vld1q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vldrq_p128(simde_poly128_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vldrq_p128(ptr); #else simde_poly128_t r_; @@ -648,7 +686,7 @@ simde_vld1_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_BF16)) #undef vld1_bf16 #define vld1_bf16(a) simde_vld1_bf16((a)) #endif @@ -664,7 +702,7 @@ simde_vld1q_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_BF16)) #undef vld1q_bf16 #define vld1q_bf16(a) simde_vld1q_bf16((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld1_dup.h b/thirdparty/simde/arm/neon/ld1_dup.h index cc15cf982..e9cf43239 100644 --- a/thirdparty/simde/arm/neon/ld1_dup.h +++ b/thirdparty/simde/arm/neon/ld1_dup.h @@ -45,7 +45,7 @@ simde_vld1_dup_f16(simde_float16_t const * ptr) { return simde_vdup_n_f16(*ptr); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vld1_dup_f16 #define 
vld1_dup_f16(a) simde_vld1_dup_f16((a)) #endif @@ -201,7 +201,7 @@ simde_vld1q_dup_f16(simde_float16_t const * ptr) { return simde_vdupq_n_f16(*ptr); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vld1q_dup_f16 #define vld1q_dup_f16(a) simde_vld1q_dup_f16((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld1_lane.h b/thirdparty/simde/arm/neon/ld1_lane.h index 5818ead64..961a67209 100644 --- a/thirdparty/simde/arm/neon/ld1_lane.h +++ b/thirdparty/simde/arm/neon/ld1_lane.h @@ -173,7 +173,7 @@ simde_float16x4_t simde_vld1_lane_f16(simde_float16_t const *ptr, simde_float16x #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vld1_lane_f16(ptr, src, lane) vld1_lane_f16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vld1_lane_f16 #define vld1_lane_f16(ptr, src, lane) simde_vld1_lane_f16((ptr), (src), (lane)) #endif @@ -349,7 +349,7 @@ simde_float16x8_t simde_vld1q_lane_f16(simde_float16_t const *ptr, simde_float16 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vld1q_lane_f16(ptr, src, lane) vld1q_lane_f16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vld1q_lane_f16 #define vld1q_lane_f16(ptr, src, lane) simde_vld1q_lane_f16((ptr), (src), (lane)) #endif @@ -498,7 +498,8 @@ simde_bfloat16x4_t simde_vld1_lane_bf16(simde_bfloat16_t const *ptr, simde_bfloa #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vld1_lane_bf16(ptr, src, lane) vld1_lane_bf16(ptr, src, lane) 
#endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld1_lane_bf16 #define vld1_lane_bf16(ptr, src, lane) simde_vld1_lane_bf16((ptr), (src), (lane)) #endif @@ -514,7 +515,8 @@ simde_bfloat16x8_t simde_vld1q_lane_bf16(simde_bfloat16_t const *ptr, simde_bflo #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vld1q_lane_bf16(ptr, src, lane) vld1q_lane_bf16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld1q_lane_bf16 #define vld1q_lane_bf16(ptr, src, lane) simde_vld1q_lane_bf16((ptr), (src), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/ld1_x2.h b/thirdparty/simde/arm/neon/ld1_x2.h index 75ce61d10..c502debb4 100644 --- a/thirdparty/simde/arm/neon/ld1_x2.h +++ b/thirdparty/simde/arm/neon/ld1_x2.h @@ -36,7 +36,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ @@ -65,7 +65,10 @@ simde_vld1_f16_x2(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_f16_x2 #define vld1_f16_x2(a) simde_vld1_f16_x2((a)) #endif @@ -93,7 +96,9 @@ simde_vld1_f32_x2(simde_float32 const 
ptr[HEDLEY_ARRAY_PARAM(4)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_f32_x2 #define vld1_f32_x2(a) simde_vld1_f32_x2((a)) #endif @@ -121,7 +126,9 @@ simde_vld1_f64_x2(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(2)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vld1_f64_x2 #define vld1_f64_x2(a) simde_vld1_f64_x2((a)) #endif @@ -149,7 +156,9 @@ simde_vld1_s8_x2(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s8_x2 #define vld1_s8_x2(a) simde_vld1_s8_x2((a)) #endif @@ -177,7 +186,9 @@ simde_vld1_s16_x2(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) 
&& defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s16_x2 #define vld1_s16_x2(a) simde_vld1_s16_x2((a)) #endif @@ -205,7 +216,9 @@ simde_vld1_s32_x2(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s32_x2 #define vld1_s32_x2(a) simde_vld1_s32_x2((a)) #endif @@ -233,7 +246,9 @@ simde_vld1_s64_x2(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s64_x2 #define vld1_s64_x2(a) simde_vld1_s64_x2((a)) #endif @@ -261,7 +276,9 @@ simde_vld1_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u8_x2 #define vld1_u8_x2(a) simde_vld1_u8_x2((a)) #endif @@ -289,7 +306,9 @@ simde_vld1_u16_x2(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u16_x2 #define vld1_u16_x2(a) simde_vld1_u16_x2((a)) #endif @@ -317,7 +336,9 @@ simde_vld1_u32_x2(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u32_x2 #define vld1_u32_x2(a) simde_vld1_u32_x2((a)) #endif @@ -345,7 +366,9 @@ simde_vld1_u64_x2(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u64_x2 #define vld1_u64_x2(a) simde_vld1_u64_x2((a)) #endif @@ -370,7 +393,8 @@ simde_vld1_p8_x2(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) #undef vld1_p8_x2 #define vld1_p8_x2(a) simde_vld1_p8_x2((a)) #endif @@ -395,7 +419,8 @@ simde_vld1_p16_x2(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return s_; #endif 
} -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) #undef vld1_p16_x2 #define vld1_p16_x2(a) simde_vld1_p16_x2((a)) #endif @@ -409,6 +434,10 @@ simde_vld1_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1_p64_x2(ptr); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_poly64x1_private a_[2]; #if defined(SIMDE_RISCV_V_NATIVE) a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); @@ -418,12 +447,17 @@ simde_vld1_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { a_[i].values[0] = ptr[i]; } #endif + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_poly64x1x2_t s_ = { { simde_poly64x1_from_private(a_[0]), simde_poly64x1_from_private(a_[1]) } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_p64_x2 #define vld1_p64_x2(a) simde_vld1_p64_x2((a)) #endif @@ -443,7 +477,8 @@ simde_vld1_bf16_x2(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld1_bf16_x2 #define vld1_bf16_x2(a) simde_vld1_bf16_x2((a)) #endif diff --git 
a/thirdparty/simde/arm/neon/ld1_x3.h b/thirdparty/simde/arm/neon/ld1_x3.h index bdaf8e527..a34ce54e7 100644 --- a/thirdparty/simde/arm/neon/ld1_x3.h +++ b/thirdparty/simde/arm/neon/ld1_x3.h @@ -35,7 +35,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ @@ -66,7 +66,10 @@ simde_vld1_f16_x3(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_f16_x3 #define vld1_f16_x3(a) simde_vld1_f16_x3((a)) #endif @@ -96,7 +99,9 @@ simde_vld1_f32_x3(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(6)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_f32_x3 #define vld1_f32_x3(a) simde_vld1_f32_x3((a)) #endif @@ -126,7 +131,9 @@ simde_vld1_f64_x3(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(3)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || 
SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vld1_f64_x3 #define vld1_f64_x3(a) simde_vld1_f64_x3((a)) #endif @@ -156,7 +163,9 @@ simde_vld1_s8_x3(int8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s8_x3 #define vld1_s8_x3(a) simde_vld1_s8_x3((a)) #endif @@ -186,7 +195,9 @@ simde_vld1_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s16_x3 #define vld1_s16_x3(a) simde_vld1_s16_x3((a)) #endif @@ -216,7 +227,9 @@ simde_vld1_s32_x3(int32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s32_x3 #define vld1_s32_x3(a) simde_vld1_s32_x3((a)) #endif @@ -246,7 +259,9 @@ simde_vld1_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s64_x3 #define vld1_s64_x3(a) simde_vld1_s64_x3((a)) #endif @@ -276,7 +291,9 @@ simde_vld1_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u8_x3 #define vld1_u8_x3(a) simde_vld1_u8_x3((a)) #endif @@ -306,7 +323,9 @@ simde_vld1_u16_x3(uint16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u16_x3 #define vld1_u16_x3(a) simde_vld1_u16_x3((a)) #endif @@ -336,7 +355,9 @@ simde_vld1_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && 
defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u32_x3 #define vld1_u32_x3(a) simde_vld1_u32_x3((a)) #endif @@ -366,7 +387,9 @@ simde_vld1_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u64_x3 #define vld1_u64_x3(a) simde_vld1_u64_x3((a)) #endif @@ -394,7 +417,8 @@ simde_vld1_p8_x3(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vld1_p8_x3 #define vld1_p8_x3(a) simde_vld1_p8_x3((a)) #endif @@ -422,7 +446,8 @@ simde_vld1_p16_x3(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vld1_p16_x3 #define vld1_p16_x3(a) simde_vld1_p16_x3((a)) #endif @@ -452,7 +477,9 @@ simde_vld1_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || 
(SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_p64_x3 #define vld1_p64_x3(a) simde_vld1_p64_x3((a)) #endif @@ -473,7 +500,8 @@ simde_vld1_bf16_x3(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(12)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld1_bf16_x3 #define vld1_bf16_x3(a) simde_vld1_bf16_x3((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld1_x4.h b/thirdparty/simde/arm/neon/ld1_x4.h index 1d797364b..bb72da4ba 100644 --- a/thirdparty/simde/arm/neon/ld1_x4.h +++ b/thirdparty/simde/arm/neon/ld1_x4.h @@ -36,7 +36,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ @@ -69,7 +69,10 @@ simde_vld1_f16_x4(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_f16_x4 #define vld1_f16_x4(a) simde_vld1_f16_x4((a)) #endif @@ -101,7 +104,9 @@ simde_vld1_f32_x4(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || 
(SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_f32_x4 #define vld1_f32_x4(a) simde_vld1_f32_x4((a)) #endif @@ -133,7 +138,9 @@ simde_vld1_f64_x4(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vld1_f64_x4 #define vld1_f64_x4(a) simde_vld1_f64_x4((a)) #endif @@ -165,7 +172,9 @@ simde_vld1_s8_x4(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s8_x4 #define vld1_s8_x4(a) simde_vld1_s8_x4((a)) #endif @@ -197,7 +206,9 @@ simde_vld1_s16_x4(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s16_x4 #define vld1_s16_x4(a) simde_vld1_s16_x4((a)) #endif @@ -229,7 +240,9 @@ simde_vld1_s32_x4(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s32_x4 #define vld1_s32_x4(a) simde_vld1_s32_x4((a)) #endif @@ -261,7 +274,9 @@ simde_vld1_s64_x4(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_s64_x4 #define vld1_s64_x4(a) simde_vld1_s64_x4((a)) #endif @@ -293,7 +308,9 @@ simde_vld1_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u8_x4 #define vld1_u8_x4(a) simde_vld1_u8_x4((a)) #endif @@ -325,7 +342,9 @@ simde_vld1_u16_x4(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u16_x4 #define vld1_u16_x4(a) 
simde_vld1_u16_x4((a)) #endif @@ -357,7 +376,9 @@ simde_vld1_u32_x4(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u32_x4 #define vld1_u32_x4(a) simde_vld1_u32_x4((a)) #endif @@ -389,7 +410,9 @@ simde_vld1_u64_x4(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_u64_x4 #define vld1_u64_x4(a) simde_vld1_u64_x4((a)) #endif @@ -419,7 +442,9 @@ simde_vld1_p8_x4(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_p8_x4 #define vld1_p8_x4(a) simde_vld1_p8_x4((a)) #endif @@ -449,7 +474,8 @@ simde_vld1_p16_x4(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + 
!(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vld1_p16_x4 #define vld1_p16_x4(a) simde_vld1_p16_x4((a)) #endif @@ -481,7 +507,9 @@ simde_vld1_p64_x4(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1_p64_x4 #define vld1_p64_x4(a) simde_vld1_p64_x4((a)) #endif @@ -503,7 +531,8 @@ simde_vld1_bf16_x4(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld1_bf16_x4 #define vld1_bf16_x4(a) simde_vld1_bf16_x4((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld1q_x2.h b/thirdparty/simde/arm/neon/ld1q_x2.h index da1da866a..df6452d23 100644 --- a/thirdparty/simde/arm/neon/ld1q_x2.h +++ b/thirdparty/simde/arm/neon/ld1q_x2.h @@ -26,17 +26,19 @@ * 2021 Décio Luiz Gazzoni Filho * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + * 2025 Michael R. 
Crusoe */ #if !defined(SIMDE_ARM_NEON_LD1Q_X2_H) #define SIMDE_ARM_NEON_LD1Q_X2_H #include "types.h" +#include "ld1.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ @@ -48,25 +50,20 @@ simde_vld1q_f16_x2(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #if \ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ - (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ - defined(SIMDE_ARM_NEON_FP16) + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_f16_x2(ptr); #else - simde_float16x8_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); - a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); - #else - for (size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_float16x8x2_t s_ = { { simde_float16x8_from_private(a_[0]), - simde_float16x8_from_private(a_[1]) } }; + simde_float16x8_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_f16(ptr + 8*i); + } + simde_float16x8x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_f16_x2 #define vld1q_f16_x2(a) simde_vld1q_f16_x2((a)) #endif @@ 
-80,21 +77,17 @@ simde_vld1q_f32_x2(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_f32_x2(ptr); #else - simde_float32x4_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); - a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); - #else - for (size_t i = 0; i < 8; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } - #endif - simde_float32x4x2_t s_ = { { simde_float32x4_from_private(a_[0]), - simde_float32x4_from_private(a_[1]) } }; + simde_float32x4_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_f32(ptr + 4*i); + } + simde_float32x4x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_f32_x2 #define vld1q_f32_x2(a) simde_vld1q_f32_x2((a)) #endif @@ -108,21 +101,17 @@ simde_vld1q_f64_x2(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vld1q_f64_x2(ptr); #else - simde_float64x2_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); - #else - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_float64x2x2_t s_ = { { simde_float64x2_from_private(a_[0]), - simde_float64x2_from_private(a_[1]) } }; + simde_float64x2_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_f64(ptr + 2*i); + } + simde_float64x2x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vld1q_f64_x2 #define vld1q_f64_x2(a) simde_vld1q_f64_x2((a)) #endif @@ -136,21 +125,17 @@ simde_vld1q_s8_x2(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s8_x2(ptr); #else - simde_int8x16_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); - a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); - #else - for (size_t i = 0; i < 32; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - #endif - simde_int8x16x2_t s_ = { { simde_int8x16_from_private(a_[0]), - simde_int8x16_from_private(a_[1]) } }; + simde_int8x16_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_s8(ptr + 16*i); + } + simde_int8x16x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s8_x2 #define vld1q_s8_x2(a) simde_vld1q_s8_x2((a)) #endif @@ -164,21 +149,17 @@ simde_vld1q_s16_x2(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s16_x2(ptr); #else - simde_int16x8_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); - a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); - #else - for 
(size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_int16x8x2_t s_ = { { simde_int16x8_from_private(a_[0]), - simde_int16x8_from_private(a_[1]) } }; + simde_int16x8_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_s16(ptr + 8*i); + } + simde_int16x8x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s16_x2 #define vld1q_s16_x2(a) simde_vld1q_s16_x2((a)) #endif @@ -192,21 +173,17 @@ simde_vld1q_s32_x2(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s32_x2(ptr); #else - simde_int32x4_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); - a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); - #else - for (size_t i = 0; i < 8; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } - #endif - simde_int32x4x2_t s_ = { { simde_int32x4_from_private(a_[0]), - simde_int32x4_from_private(a_[1]) } }; + simde_int32x4_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_s32(ptr + 4*i); + } + simde_int32x4x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s32_x2 #define 
vld1q_s32_x2(a) simde_vld1q_s32_x2((a)) #endif @@ -220,21 +197,17 @@ simde_vld1q_s64_x2(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s64_x2(ptr); #else - simde_int64x2_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); - #else - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_int64x2x2_t s_ = { { simde_int64x2_from_private(a_[0]), - simde_int64x2_from_private(a_[1]) } }; + simde_int64x2_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_s64(ptr + 2*i); + } + simde_int64x2x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s64_x2 #define vld1q_s64_x2(a) simde_vld1q_s64_x2((a)) #endif @@ -248,21 +221,17 @@ simde_vld1q_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u8_x2(ptr); #else - simde_uint8x16_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); - a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); - #else - for (size_t i = 0; i < 32; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - #endif - simde_uint8x16x2_t s_ = { { simde_uint8x16_from_private(a_[0]), - simde_uint8x16_from_private(a_[1]) } }; + simde_uint8x16_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_u8(ptr + 16*i); + } + simde_uint8x16x2_t s_ = { { a_[0], 
a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u8_x2 #define vld1q_u8_x2(a) simde_vld1q_u8_x2((a)) #endif @@ -276,21 +245,17 @@ simde_vld1q_u16_x2(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u16_x2(ptr); #else - simde_uint16x8_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); - a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); - #else - for (size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_uint16x8x2_t s_ = { { simde_uint16x8_from_private(a_[0]), - simde_uint16x8_from_private(a_[1]) } }; + simde_uint16x8_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_u16(ptr + 8*i); + } + simde_uint16x8x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u16_x2 #define vld1q_u16_x2(a) simde_vld1q_u16_x2((a)) #endif @@ -304,21 +269,17 @@ simde_vld1q_u32_x2(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u32_x2(ptr); #else - simde_uint32x4_private a_[2]; - #if 
defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); - a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); - #else - for (size_t i = 0; i < 8; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } - #endif - simde_uint32x4x2_t s_ = { { simde_uint32x4_from_private(a_[0]), - simde_uint32x4_from_private(a_[1]) } }; + simde_uint32x4_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_u32(ptr + 4*i); + } + simde_uint32x4x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u32_x2 #define vld1q_u32_x2(a) simde_vld1q_u32_x2((a)) #endif @@ -332,22 +293,18 @@ simde_vld1q_u64_x2(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u64_x2(ptr); #else - simde_uint64x2_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); - #else - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_uint64x2x2_t s_ = { { simde_uint64x2_from_private(a_[0]), - simde_uint64x2_from_private(a_[1]) } }; + simde_uint64x2_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_u64(ptr + 2*i); + } + simde_uint64x2x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) - #undef vld1q_u64_x2 +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && 
defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) +#undef vld1q_u64_x2 #define vld1q_u64_x2(a) simde_vld1q_u64_x2((a)) #endif @@ -359,21 +316,16 @@ simde_vld1q_p8_x2(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_p8_x2(ptr); #else - simde_poly8x16_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); - a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); - #else - for (size_t i = 0; i < 32; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - #endif - simde_poly8x16x2_t s_ = { { simde_poly8x16_from_private(a_[0]), - simde_poly8x16_from_private(a_[1]) } }; + simde_poly8x16_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_p8(ptr + 16*i); + } + simde_poly8x16x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_p8_x2 #define vld1q_p8_x2(a) simde_vld1q_p8_x2((a)) #endif @@ -386,21 +338,16 @@ simde_vld1q_p16_x2(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_p16_x2(ptr); #else - simde_poly16x8_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); - a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); - #else - for (size_t i = 0; i < 16; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_poly16x8x2_t s_ = { { simde_poly16x8_from_private(a_[0]), - simde_poly16x8_from_private(a_[1]) } }; + simde_poly16x8_t a_[2]; + for (size_t i = 0; i < 2; 
i++) { + a_[i] = simde_vld1q_p16(ptr + 8*i); + } + simde_poly16x8x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_p16_x2 #define vld1q_p16_x2(a) simde_vld1q_p16_x2((a)) #endif @@ -413,21 +360,16 @@ simde_vld1q_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_p64_x2(ptr); #else - simde_poly64x2_private a_[2]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); - #else - for (size_t i = 0; i < 4; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_poly64x2x2_t s_ = { { simde_poly64x2_from_private(a_[0]), - simde_poly64x2_from_private(a_[1]) } }; + simde_poly64x2_t a_[2]; + for (size_t i = 0; i < 2; i++) { + a_[i] = simde_vld1q_p64(ptr + 2*i); + } + simde_poly64x2x2_t s_ = { { a_[0], a_[1] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_p64_x2 #define vld1q_p64_x2(a) simde_vld1q_p64_x2((a)) #endif @@ -447,12 +389,12 @@ simde_vld1q_bf16_x2(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_bf16_x2 
#define vld1q_bf16_x2(a) simde_vld1q_bf16_x2((a)) #endif - #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/thirdparty/simde/arm/neon/ld1q_x3.h b/thirdparty/simde/arm/neon/ld1q_x3.h index ec82989e7..c34109613 100644 --- a/thirdparty/simde/arm/neon/ld1q_x3.h +++ b/thirdparty/simde/arm/neon/ld1q_x3.h @@ -25,17 +25,19 @@ * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + * 2025 Michael R. Crusoe */ #if !defined(SIMDE_ARM_NEON_LD1Q_X3_H) #define SIMDE_ARM_NEON_LD1Q_X3_H #include "types.h" +#include "ld1.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ @@ -50,23 +52,18 @@ simde_vld1q_f16_x3(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_f16_x3(ptr); #else - simde_float16x8_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); - a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); - a_[2].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+16) , 8); - #else - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_float16x8x3_t s_ = { { simde_float16x8_from_private(a_[0]), - simde_float16x8_from_private(a_[1]), - simde_float16x8_from_private(a_[2]) } }; + simde_float16x8_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_f16(ptr + 8*i); + } + simde_float16x8x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + 
(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_f16_x3 #define vld1q_f16_x3(a) simde_vld1q_f16_x3((a)) #endif @@ -80,23 +77,17 @@ simde_vld1q_f32_x3(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(12)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_f32_x3(ptr); #else - simde_float32x4_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); - a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); - a_[2].sv128 = __riscv_vle32_v_f32m1(ptr+8 , 4); - #else - for (size_t i = 0; i < 12; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } - #endif - simde_float32x4x3_t s_ = { { simde_float32x4_from_private(a_[0]), - simde_float32x4_from_private(a_[1]), - simde_float32x4_from_private(a_[2]) } }; + simde_float32x4_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_f32(ptr + 4*i); + } + simde_float32x4x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_f32_x3 #define vld1q_f32_x3(a) simde_vld1q_f32_x3((a)) #endif @@ -110,23 +101,17 @@ simde_vld1q_f64_x3(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(6)]) { (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vld1q_f64_x3(ptr); #else - simde_float64x2_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); - a_[2].sv128 
= __riscv_vle64_v_f64m1(ptr+4 , 2); - #else - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_float64x2x3_t s_ = { { simde_float64x2_from_private(a_[0]), - simde_float64x2_from_private(a_[1]), - simde_float64x2_from_private(a_[2]) } }; + simde_float64x2_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_f64(ptr + 2*i); + } + simde_float64x2x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vld1q_f64_x3 #define vld1q_f64_x3(a) simde_vld1q_f64_x3((a)) #endif @@ -140,113 +125,89 @@ simde_vld1q_s8_x3(int8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s8_x3(ptr); #else - simde_int8x16_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); - a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); - a_[2].sv128 = __riscv_vle8_v_i8m1(ptr+32 , 16); - #else - for (size_t i = 0; i < 48; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - #endif - simde_int8x16x3_t s_ = { { simde_int8x16_from_private(a_[0]), - simde_int8x16_from_private(a_[1]), - simde_int8x16_from_private(a_[2]) } }; + simde_int8x16_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_s8(ptr + 16*i); + } + simde_int8x16x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || 
(SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s8_x3 #define vld1q_s8_x3(a) simde_vld1q_s8_x3((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_int16x8x3_t -simde_vld1q_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { +simde_vld1q_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { #if \ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s16_x3(ptr); #else - simde_int16x8_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); - a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); - a_[2].sv128 = __riscv_vle16_v_i16m1(ptr+16 , 8); - #else - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_int16x8x3_t s_ = { { simde_int16x8_from_private(a_[0]), - simde_int16x8_from_private(a_[1]), - simde_int16x8_from_private(a_[2]) } }; + simde_int16x8_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_s16(ptr + 8*i); + } + simde_int16x8x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s16_x3 #define vld1q_s16_x3(a) simde_vld1q_s16_x3((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_int32x4x3_t -simde_vld1q_s32_x3(int32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { +simde_vld1q_s32_x3(int32_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { #if \ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || 
(HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s32_x3(ptr); #else - simde_int32x4_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); - a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); - a_[2].sv128 = __riscv_vle32_v_i32m1(ptr+8 , 4); - #else - for (size_t i = 0; i < 12; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } - #endif - simde_int32x4x3_t s_ = { { simde_int32x4_from_private(a_[0]), - simde_int32x4_from_private(a_[1]), - simde_int32x4_from_private(a_[2]) } }; + simde_int32x4_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_s32(ptr + 4*i); + } + simde_int32x4x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s32_x3 #define vld1q_s32_x3(a) simde_vld1q_s32_x3((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_int64x2x3_t -simde_vld1q_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { +simde_vld1q_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { #if \ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s64_x3(ptr); #else - simde_int64x2_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); - a_[2].sv128 = __riscv_vle64_v_i64m1(ptr+4 , 2); - 
#else - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_int64x2x3_t s_ = { { simde_int64x2_from_private(a_[0]), - simde_int64x2_from_private(a_[1]), - simde_int64x2_from_private(a_[2]) } }; + simde_int64x2_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_s64(ptr + 2*i); + } + simde_int64x2x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s64_x3 #define vld1q_s64_x3(a) simde_vld1q_s64_x3((a)) #endif @@ -260,23 +221,17 @@ simde_vld1q_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u8_x3(ptr); #else - simde_uint8x16_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); - a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); - a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); - #else - for (size_t i = 0; i < 48; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - #endif - simde_uint8x16x3_t s_ = { { simde_uint8x16_from_private(a_[0]), - simde_uint8x16_from_private(a_[1]), - simde_uint8x16_from_private(a_[2]) } }; + simde_uint8x16_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_u8(ptr + 16*i); + } + simde_uint8x16x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) 
&& \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u8_x3 #define vld1q_u8_x3(a) simde_vld1q_u8_x3((a)) #endif @@ -290,83 +245,65 @@ simde_vld1q_u16_x3(uint16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u16_x3(ptr); #else - simde_uint16x8_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); - a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); - a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); - #else - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_uint16x8x3_t s_ = { { simde_uint16x8_from_private(a_[0]), - simde_uint16x8_from_private(a_[1]), - simde_uint16x8_from_private(a_[2]) } }; + simde_uint16x8_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_u16(ptr + 8*i); + } + simde_uint16x8x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u16_x3 #define vld1q_u16_x3(a) simde_vld1q_u16_x3((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4x3_t -simde_vld1q_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { +simde_vld1q_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { #if \ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u32_x3(ptr); #else - 
simde_uint32x4_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); - a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); - a_[2].sv128 = __riscv_vle32_v_u32m1(ptr+8 , 4); - #else - for (size_t i = 0; i < 12; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } - #endif - simde_uint32x4x3_t s_ = { { simde_uint32x4_from_private(a_[0]), - simde_uint32x4_from_private(a_[1]), - simde_uint32x4_from_private(a_[2]) } }; + simde_uint32x4_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_u32(ptr + 4*i); + } + simde_uint32x4x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u32_x3 #define vld1q_u32_x3(a) simde_vld1q_u32_x3((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2x3_t -simde_vld1q_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { +simde_vld1q_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { #if \ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u64_x3(ptr); #else - simde_uint64x2_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); - a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); - #else - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_uint64x2x3_t s_ = { { simde_uint64x2_from_private(a_[0]), - simde_uint64x2_from_private(a_[1]), - 
simde_uint64x2_from_private(a_[2]) } }; + simde_uint64x2_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_u64(ptr + 2*i); + } + simde_uint64x2x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u64_x3 #define vld1q_u64_x3(a) simde_vld1q_u64_x3((a)) #endif @@ -379,23 +316,16 @@ simde_vld1q_p8_x3(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_p8_x3(ptr); #else - simde_poly8x16_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); - a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); - a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); - #else - for (size_t i = 0; i < 48; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - #endif - simde_poly8x16x3_t s_ = { { simde_poly8x16_from_private(a_[0]), - simde_poly8x16_from_private(a_[1]), - simde_poly8x16_from_private(a_[2]) } }; + simde_poly8x16_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_p8(ptr + 16*i); + } + simde_poly8x16x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_p8_x3 #define vld1q_p8_x3(a) simde_vld1q_p8_x3((a)) #endif @@ -408,52 +338,38 @@ simde_vld1q_p16_x3(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) {
(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_p16_x3(ptr); #else - simde_poly16x8_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); - a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); - a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); - #else - for (size_t i = 0; i < 24; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_poly16x8x3_t s_ = { { simde_poly16x8_from_private(a_[0]), - simde_poly16x8_from_private(a_[1]), - simde_poly16x8_from_private(a_[2]) } }; + simde_poly16x8_t a_[3]; + for (size_t i = 0; i < 3; i++) { + a_[i] = simde_vld1q_p16(ptr + 8*i); + } + simde_poly16x8x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_p16_x3 #define vld1q_p16_x3(a) simde_vld1q_p16_x3((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_poly64x2x3_t -simde_vld1q_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { +simde_vld1q_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { #if \ defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_p64_x3(ptr); #else - simde_poly64x2_private a_[3]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); - a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); - #else - for (size_t i = 0; i < 6; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_poly64x2x3_t s_ = { { simde_poly64x2_from_private(a_[0]), - simde_poly64x2_from_private(a_[1]), - simde_poly64x2_from_private(a_[2]) } }; + simde_poly64x2_t a_[3]; + for (size_t i = 0; 
i < 3; i++) { + a_[i] = simde_vld1q_p64(ptr + 2*i); + } + simde_poly64x2x3_t s_ = { { a_[0], a_[1], a_[2] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_p64_x3 #define vld1q_p64_x3(a) simde_vld1q_p64_x3((a)) #endif @@ -474,7 +390,8 @@ simde_vld1q_bf16_x3(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(24)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld1q_bf16_x3 #define vld1q_bf16_x3(a) simde_vld1q_bf16_x3((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld1q_x4.h b/thirdparty/simde/arm/neon/ld1q_x4.h index 2fa4c1a69..96e526777 100644 --- a/thirdparty/simde/arm/neon/ld1q_x4.h +++ b/thirdparty/simde/arm/neon/ld1q_x4.h @@ -26,17 +26,19 @@ * 2021 Décio Luiz Gazzoni Filho * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + * 2025 Michael R. 
Crusoe */ #if !defined(SIMDE_ARM_NEON_LD1Q_X4_H) #define SIMDE_ARM_NEON_LD1Q_X4_H #include "types.h" +#include "ld1.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ @@ -51,25 +53,18 @@ simde_vld1q_f16_x4(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_f16_x4(ptr); #else - simde_float16x8_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH - a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); - a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); - a_[2].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+16) , 8); - a_[3].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+24) , 8); - #else - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_float16x8x4_t s_ = { { simde_float16x8_from_private(a_[0]), - simde_float16x8_from_private(a_[1]), - simde_float16x8_from_private(a_[2]), - simde_float16x8_from_private(a_[3]) } }; + simde_float16x8_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_f16(ptr + 8*i); + } + simde_float16x8x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_f16_x4 #define vld1q_f16_x4(a) simde_vld1q_f16_x4((a)) #endif @@ -83,25 +78,17 @@ simde_vld1q_f32_x4(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) { (!defined(__clang__) || 
(SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_f32_x4(ptr); #else - simde_float32x4_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); - a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); - a_[2].sv128 = __riscv_vle32_v_f32m1(ptr+8 , 4); - a_[3].sv128 = __riscv_vle32_v_f32m1(ptr+12 , 4); - #else - for (size_t i = 0; i < 16; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } - #endif - simde_float32x4x4_t s_ = { { simde_float32x4_from_private(a_[0]), - simde_float32x4_from_private(a_[1]), - simde_float32x4_from_private(a_[2]), - simde_float32x4_from_private(a_[3]) } }; + simde_float32x4_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_f32(ptr + 4*i); + } + simde_float32x4x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_f32_x4 #define vld1q_f32_x4(a) simde_vld1q_f32_x4((a)) #endif @@ -115,25 +102,17 @@ simde_vld1q_f64_x4(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(8)]) { (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vld1q_f64_x4(ptr); #else - simde_float64x2_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); - a_[2].sv128 = __riscv_vle64_v_f64m1(ptr+4 , 2); - a_[3].sv128 = __riscv_vle64_v_f64m1(ptr+6 , 2); - #else - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_float64x2x4_t s_ = { { simde_float64x2_from_private(a_[0]), - simde_float64x2_from_private(a_[1]), - 
simde_float64x2_from_private(a_[2]), - simde_float64x2_from_private(a_[3]) } }; + simde_float64x2_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_f64(ptr + 2*i); + } + simde_float64x2x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)))) #undef vld1q_f64_x4 #define vld1q_f64_x4(a) simde_vld1q_f64_x4((a)) #endif @@ -147,25 +126,17 @@ simde_vld1q_s8_x4(int8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s8_x4(ptr); #else - simde_int8x16_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); - a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); - a_[2].sv128 = __riscv_vle8_v_i8m1(ptr+32 , 16); - a_[3].sv128 = __riscv_vle8_v_i8m1(ptr+48 , 16); - #else - for (size_t i = 0; i < 64; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - #endif - simde_int8x16x4_t s_ = { { simde_int8x16_from_private(a_[0]), - simde_int8x16_from_private(a_[1]), - simde_int8x16_from_private(a_[2]), - simde_int8x16_from_private(a_[3]) } }; + simde_int8x16_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_s8(ptr + 16*i); + } + simde_int8x16x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s8_x4 
#define vld1q_s8_x4(a) simde_vld1q_s8_x4((a)) #endif @@ -179,25 +150,17 @@ simde_vld1q_s16_x4(int16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s16_x4(ptr); #else - simde_int16x8_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); - a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); - a_[2].sv128 = __riscv_vle16_v_i16m1(ptr+16 , 8); - a_[3].sv128 = __riscv_vle16_v_i16m1(ptr+24 , 8); - #else - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_int16x8x4_t s_ = { { simde_int16x8_from_private(a_[0]), - simde_int16x8_from_private(a_[1]), - simde_int16x8_from_private(a_[2]), - simde_int16x8_from_private(a_[3]) } }; + simde_int16x8_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_s16(ptr + 8*i); + } + simde_int16x8x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s16_x4 #define vld1q_s16_x4(a) simde_vld1q_s16_x4((a)) #endif @@ -211,25 +174,17 @@ simde_vld1q_s32_x4(int32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s32_x4(ptr); #else - simde_int32x4_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); - a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); - a_[2].sv128 = __riscv_vle32_v_i32m1(ptr+8 , 4); - a_[3].sv128 = __riscv_vle32_v_i32m1(ptr+12 , 4); - #else - for (size_t i = 0; i < 16; i++) { - 
a_[i / 4].values[i % 4] = ptr[i]; - } - #endif - simde_int32x4x4_t s_ = { { simde_int32x4_from_private(a_[0]), - simde_int32x4_from_private(a_[1]), - simde_int32x4_from_private(a_[2]), - simde_int32x4_from_private(a_[3]) } }; + simde_int32x4_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_s32(ptr + 4*i); + } + simde_int32x4x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s32_x4 #define vld1q_s32_x4(a) simde_vld1q_s32_x4((a)) #endif @@ -243,25 +198,17 @@ simde_vld1q_s64_x4(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_s64_x4(ptr); #else - simde_int64x2_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); - a_[2].sv128 = __riscv_vle64_v_i64m1(ptr+4 , 2); - a_[3].sv128 = __riscv_vle64_v_i64m1(ptr+6 , 2); - #else - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_int64x2x4_t s_ = { { simde_int64x2_from_private(a_[0]), - simde_int64x2_from_private(a_[1]), - simde_int64x2_from_private(a_[1]), - simde_int64x2_from_private(a_[1]) } }; + simde_int64x2_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_s64(ptr + 2*i); + } + simde_int64x2x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + 
!((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_s64_x4 #define vld1q_s64_x4(a) simde_vld1q_s64_x4((a)) #endif @@ -275,25 +222,17 @@ simde_vld1q_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u8_x4(ptr); #else - simde_uint8x16_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); - a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); - a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); - a_[3].sv128 = __riscv_vle8_v_u8m1(ptr+48 , 16); - #else - for (size_t i = 0; i < 64; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - #endif - simde_uint8x16x4_t s_ = { { simde_uint8x16_from_private(a_[0]), - simde_uint8x16_from_private(a_[1]), - simde_uint8x16_from_private(a_[2]), - simde_uint8x16_from_private(a_[3]) } }; + simde_uint8x16_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_u8(ptr + 16*i); + } + simde_uint8x16x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u8_x4 #define vld1q_u8_x4(a) simde_vld1q_u8_x4((a)) #endif @@ -307,25 +246,17 @@ simde_vld1q_u16_x4(uint16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u16_x4(ptr); #else - simde_uint16x8_private a_[4]; - #if 
defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); - a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); - a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); - a_[3].sv128 = __riscv_vle16_v_u16m1(ptr+24 , 8); - #else - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_uint16x8x4_t s_ = { { simde_uint16x8_from_private(a_[0]), - simde_uint16x8_from_private(a_[1]), - simde_uint16x8_from_private(a_[2]), - simde_uint16x8_from_private(a_[3]) } }; + simde_uint16x8_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_u16(ptr + 8*i); + } + simde_uint16x8x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u16_x4 #define vld1q_u16_x4(a) simde_vld1q_u16_x4((a)) #endif @@ -339,25 +270,17 @@ simde_vld1q_u32_x4(uint32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u32_x4(ptr); #else - simde_uint32x4_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); - a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); - a_[2].sv128 = __riscv_vle32_v_u32m1(ptr+8 , 4); - a_[3].sv128 = __riscv_vle32_v_u32m1(ptr+12 , 4); - #else - for (size_t i = 0; i < 16; i++) { - a_[i / 4].values[i % 4] = ptr[i]; - } - #endif - simde_uint32x4x4_t s_ = { { simde_uint32x4_from_private(a_[0]), - simde_uint32x4_from_private(a_[1]), - simde_uint32x4_from_private(a_[2]), - simde_uint32x4_from_private(a_[3]) } }; + simde_uint32x4_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = 
simde_vld1q_u32(ptr + 4*i); + } + simde_uint32x4x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u32_x4 #define vld1q_u32_x4(a) simde_vld1q_u32_x4((a)) #endif @@ -371,25 +294,17 @@ simde_vld1q_u64_x4(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_u64_x4(ptr); #else - simde_uint64x2_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); - a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); - a_[3].sv128 = __riscv_vle64_v_u64m1(ptr+6 , 2); - #else - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_uint64x2x4_t s_ = { { simde_uint64x2_from_private(a_[0]), - simde_uint64x2_from_private(a_[1]), - simde_uint64x2_from_private(a_[2]), - simde_uint64x2_from_private(a_[3]) } }; + simde_uint64x2_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_u64(ptr + 2*i); + } + simde_uint64x2x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))))) #undef vld1q_u64_x4 #define vld1q_u64_x4(a) simde_vld1q_u64_x4((a)) #endif @@ 
-402,25 +317,16 @@ simde_vld1q_p8_x4(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_p8_x4(ptr); #else - simde_poly8x16_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); - a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); - a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); - a_[3].sv128 = __riscv_vle8_v_u8m1(ptr+48 , 16); - #else - for (size_t i = 0; i < 64; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - #endif - simde_poly8x16x4_t s_ = { { simde_poly8x16_from_private(a_[0]), - simde_poly8x16_from_private(a_[1]), - simde_poly8x16_from_private(a_[2]), - simde_poly8x16_from_private(a_[3]) } }; + simde_poly8x16_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_p8(ptr + 16*i); + } + simde_poly8x16x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vld1q_p8_x4 #define vld1q_p8_x4(a) simde_vld1q_p8_x4((a)) #endif @@ -433,25 +339,16 @@ simde_vld1q_p16_x4(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_p16_x4(ptr); #else - simde_poly16x8_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); - a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); - a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); - a_[3].sv128 = __riscv_vle16_v_u16m1(ptr+24 , 8); - #else - for (size_t i = 0; i < 32; i++) { - a_[i / 8].values[i % 8] = ptr[i]; - } - #endif - simde_poly16x8x4_t s_ = { { simde_poly16x8_from_private(a_[0]), - simde_poly16x8_from_private(a_[1]), - 
simde_poly16x8_from_private(a_[2]), - simde_poly16x8_from_private(a_[3]) } }; + simde_poly16x8_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_p16(ptr + 8*i); + } + simde_poly16x8x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vld1q_p16_x4 #define vld1q_p16_x4(a) simde_vld1q_p16_x4((a)) #endif @@ -464,25 +361,16 @@ simde_vld1q_p64_x4(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) return vld1q_p64_x4(ptr); #else - simde_poly64x2_private a_[4]; - #if defined(SIMDE_RISCV_V_NATIVE) - a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); - a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); - a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); - a_[3].sv128 = __riscv_vle64_v_u64m1(ptr+6 , 2); - #else - for (size_t i = 0; i < 8; i++) { - a_[i / 2].values[i % 2] = ptr[i]; - } - #endif - simde_poly64x2x4_t s_ = { { simde_poly64x2_from_private(a_[0]), - simde_poly64x2_from_private(a_[1]), - simde_poly64x2_from_private(a_[2]), - simde_poly64x2_from_private(a_[3]) } }; + simde_poly64x2_t a_[4]; + for (size_t i = 0; i < 4; i++) { + a_[i] = simde_vld1q_p64(ptr + 2*i); + } + simde_poly64x2x4_t s_ = { { a_[0], a_[1], a_[2], a_[3] } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vld1q_p64_x4 #define vld1q_p64_x4(a) simde_vld1q_p64_x4((a)) #endif @@ -504,7 +392,8 @@ simde_vld1q_bf16_x4(simde_bfloat16 const 
ptr[HEDLEY_ARRAY_PARAM(32)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld1q_bf16_x4 #define vld1q_bf16_x4(a) simde_vld1q_bf16_x4((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld2.h b/thirdparty/simde/arm/neon/ld2.h index 5d0be9f33..72ab47854 100644 --- a/thirdparty/simde/arm/neon/ld2.h +++ b/thirdparty/simde/arm/neon/ld2.h @@ -37,7 +37,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ @@ -455,7 +455,8 @@ simde_vld2_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld2_f16 #define vld2_f16(a) simde_vld2_f16((a)) #endif @@ -712,6 +713,12 @@ simde_vld2q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { } }; return r; #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_int64x2_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -726,6 +733,11 @@ simde_vld2q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { } }; return r; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) + HEDLEY_DIAGNOSTIC_POP + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -899,6 +911,12 @@ 
simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { } }; return r; #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_uint64x2_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -913,6 +931,11 @@ simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { } }; return r; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) + HEDLEY_DIAGNOSTIC_POP + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -959,7 +982,8 @@ simde_vld2q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld2q_f16 #define vld2q_f16(a) simde_vld2q_f16((a)) #endif @@ -1030,6 +1054,11 @@ simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { } }; return r; #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_float64x2_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -1042,6 +1071,10 @@ simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_float64x2_from_private(r_[0]), simde_float64x2_from_private(r_[1]), } }; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH) + HEDLEY_DIAGNOSTIC_POP + #endif return r; #endif @@ -1127,6 +1160,10 @@ 
simde_vld2_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vld2_p64(ptr); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_poly64x1_private r_[2]; #if defined(SIMDE_RISCV_V_NATIVE) @@ -1140,6 +1177,9 @@ simde_vld2_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { } } #endif + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_poly64x1x2_t r = { { simde_poly64x1_from_private(r_[0]), @@ -1240,6 +1280,12 @@ simde_vld2q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_p64(ptr); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_poly64x2_private r_[2]; #if defined(SIMDE_RISCV_V_NATIVE) @@ -1259,6 +1305,11 @@ simde_vld2q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_poly64x2_from_private(r_[1]), } }; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && \ + ((HEDLEY_GCC_VERSION_CHECK(16,0,0) && defined(SIMDE_ARCH_RISCV64)) || \ + (defined(HEDLEY_GCC_VERSION) && defined(SIMDE_ARCH_LOONGARCH))) + HEDLEY_DIAGNOSTIC_POP + #endif return r; #endif } @@ -1289,7 +1340,8 @@ simde_vld2_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld2_bf16 #define vld2_bf16(a) simde_vld2_bf16((a)) #endif @@ -1316,7 +1368,8 @@ simde_vld2q_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { 
return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld2q_bf16 #define vld2q_bf16(a) simde_vld2q_bf16((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld2_dup.h b/thirdparty/simde/arm/neon/ld2_dup.h index 238807ab7..a06a13edc 100644 --- a/thirdparty/simde/arm/neon/ld2_dup.h +++ b/thirdparty/simde/arm/neon/ld2_dup.h @@ -48,7 +48,8 @@ simde_vld2_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld2_dup_f16 #define vld2_dup_f16(a) simde_vld2_dup_f16((a)) #endif @@ -257,7 +258,8 @@ simde_vld2q_dup_f16(simde_float16_t const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld2q_dup_f16 #define vld2q_dup_f16(a) simde_vld2q_dup_f16((a)) #endif @@ -524,7 +526,9 @@ simde_vld2q_dup_p8(simde_poly8_t const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_95399) && \ + !defined(SIMDE_BUG_CLANG_71763))) #undef vld2q_dup_p8 #define vld2q_dup_p8(a) simde_vld2q_dup_p8((a)) #endif @@ -544,7 +548,9 @@ simde_vld2q_dup_p16(simde_poly16_t const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_95399) && \ + !defined(SIMDE_BUG_CLANG_71763))) #undef vld2q_dup_p16 #define vld2q_dup_p16(a) 
simde_vld2q_dup_p16((a)) #endif @@ -582,7 +588,8 @@ simde_vld2_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld2_dup_bf16 #define vld2_dup_bf16(a) simde_vld2_dup_bf16((a)) #endif @@ -601,7 +608,8 @@ simde_vld2q_dup_bf16(simde_bfloat16 const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld2q_dup_bf16 #define vld2q_dup_bf16(a) simde_vld2q_dup_bf16((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld2_lane.h b/thirdparty/simde/arm/neon/ld2_lane.h index 81b29dd20..b66cd456c 100644 --- a/thirdparty/simde/arm/neon/ld2_lane.h +++ b/thirdparty/simde/arm/neon/ld2_lane.h @@ -208,7 +208,8 @@ simde_float16x4x2_t simde_vld2_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_P #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vld2_lane_f16(ptr, src, lane) vld2_lane_f16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld2_lane_f16 #define vld2_lane_f16(ptr, src, lane) simde_vld2_lane_f16((ptr), (src), (lane)) #endif @@ -427,7 +428,8 @@ simde_float16x8x2_t simde_vld2q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vld2q_lane_f16(ptr, src, lane) vld2q_lane_f16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + 
!(defined(SIMDE_ARM_NEON_FP16))) #undef vld2q_lane_f16 #define vld2q_lane_f16(ptr, src, lane) simde_vld2q_lane_f16((ptr), (src), (lane)) #endif @@ -607,7 +609,8 @@ simde_bfloat16x4x2_t simde_vld2_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRA #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vld2_lane_bf16(ptr, src, lane) vld2_lane_bf16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld2_lane_bf16 #define vld2_lane_bf16(ptr, src, lane) simde_vld2_lane_bf16((ptr), (src), (lane)) #endif @@ -627,7 +630,8 @@ simde_bfloat16x8x2_t simde_vld2q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARR #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vld2q_lane_bf16(ptr, src, lane) vld2q_lane_bf16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld2q_lane_bf16 #define vld2q_lane_bf16(ptr, src, lane) simde_vld2q_lane_bf16((ptr), (src), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/ld3.h b/thirdparty/simde/arm/neon/ld3.h index a60c2aa0d..2361a8968 100644 --- a/thirdparty/simde/arm/neon/ld3.h +++ b/thirdparty/simde/arm/neon/ld3.h @@ -36,7 +36,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ @@ -70,7 +70,8 @@ simde_vld3_f16(simde_float16_t const *ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld3_f16 #define 
vld3_f16(a) simde_vld3_f16((a)) #endif @@ -433,7 +434,8 @@ simde_vld3q_f16(simde_float16_t const *ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld3q_f16 #define vld3q_f16(a) simde_vld3q_f16((a)) #endif @@ -1072,7 +1074,8 @@ simde_vld3_bf16(simde_bfloat16 const *ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld3_bf16 #define vld3_bf16(a) simde_vld3_bf16((a)) #endif @@ -1100,7 +1103,8 @@ simde_vld3q_bf16(simde_bfloat16 const *ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld3q_bf16 #define vld3q_bf16(a) simde_vld3q_bf16((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld3_dup.h b/thirdparty/simde/arm/neon/ld3_dup.h index 25f133b69..86d604481 100644 --- a/thirdparty/simde/arm/neon/ld3_dup.h +++ b/thirdparty/simde/arm/neon/ld3_dup.h @@ -48,7 +48,8 @@ simde_vld3_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld3_dup_f16 #define vld3_dup_f16(a) simde_vld3_dup_f16((a)) #endif @@ -257,7 +258,8 @@ simde_vld3q_dup_f16(simde_float16_t const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld3q_dup_f16 #define vld3q_dup_f16(a) 
simde_vld3q_dup_f16((a)) #endif @@ -523,7 +525,8 @@ simde_vld3q_dup_p8(simde_poly8_t const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) #undef vld3q_dup_p8 #define vld3q_dup_p8(a) simde_vld3q_dup_p8((a)) #endif @@ -542,7 +545,8 @@ simde_vld3q_dup_p16(simde_poly16_t const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) #undef vld3q_dup_p16 #define vld3q_dup_p16(a) simde_vld3q_dup_p16((a)) #endif @@ -580,7 +584,8 @@ simde_vld3_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld3_dup_bf16 #define vld3_dup_bf16(a) simde_vld3_dup_bf16((a)) #endif @@ -599,7 +604,8 @@ simde_vld3q_dup_bf16(simde_bfloat16 const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld3q_dup_bf16 #define vld3q_dup_bf16(a) simde_vld3q_dup_bf16((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld3_lane.h b/thirdparty/simde/arm/neon/ld3_lane.h index 4950792a8..5072dd4ae 100644 --- a/thirdparty/simde/arm/neon/ld3_lane.h +++ b/thirdparty/simde/arm/neon/ld3_lane.h @@ -208,7 +208,8 @@ simde_float16x4x3_t simde_vld3_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_P #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vld3_lane_f16(ptr, src, lane) vld3_lane_f16(ptr, src, lane) #endif -#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld3_lane_f16 #define vld3_lane_f16(ptr, src, lane) simde_vld3_lane_f16((ptr), (src), (lane)) #endif @@ -427,7 +428,8 @@ simde_float16x8x3_t simde_vld3q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld3q_lane_f16 #define vld3q_lane_f16(ptr, src, lane) simde_vld3q_lane_f16((ptr), (src), (lane)) #endif @@ -607,7 +609,8 @@ simde_bfloat16x4x3_t simde_vld3_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRA #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vld3_lane_bf16(ptr, src, lane) vld3_lane_bf16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld3_lane_bf16 #define vld3_lane_bf16(ptr, src, lane) simde_vld3_lane_bf16((ptr), (src), (lane)) #endif @@ -627,7 +630,8 @@ simde_bfloat16x8x3_t simde_vld3q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARR #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vld3q_lane_bf16(ptr, src, lane) vld3q_lane_bf16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld3q_lane_bf16 #define vld3q_lane_bf16(ptr, src, lane) simde_vld3q_lane_bf16((ptr), (src), (lane)) #endif diff 
--git a/thirdparty/simde/arm/neon/ld4.h b/thirdparty/simde/arm/neon/ld4.h index 777c24f73..85a15e194 100644 --- a/thirdparty/simde/arm/neon/ld4.h +++ b/thirdparty/simde/arm/neon/ld4.h @@ -35,7 +35,7 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if HEDLEY_GCC_VERSION_CHECK(7,0,0) - SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_BEGIN_DECLS_ @@ -64,7 +64,8 @@ simde_vld4_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return (s_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld4_f16 #define vld4_f16(a) simde_vld4_f16((a)) #endif @@ -372,7 +373,8 @@ simde_vld4q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld4q_f16 #define vld4q_f16(a) simde_vld4q_f16((a)) #endif @@ -884,7 +886,8 @@ simde_vld4_bf16(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) { return (s_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld4_bf16 #define vld4_bf16(a) simde_vld4_bf16((a)) #endif @@ -904,7 +907,8 @@ simde_vld4q_bf16(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(32)]) { return s_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld4q_bf16 #define vld4q_bf16(a) simde_vld4q_bf16((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld4_dup.h b/thirdparty/simde/arm/neon/ld4_dup.h index 
c2100af14..a974b4fd5 100644 --- a/thirdparty/simde/arm/neon/ld4_dup.h +++ b/thirdparty/simde/arm/neon/ld4_dup.h @@ -48,7 +48,8 @@ simde_vld4_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld4_dup_f16 #define vld4_dup_f16(a) simde_vld4_dup_f16((a)) #endif @@ -257,7 +258,8 @@ simde_vld4q_dup_f16(simde_float16_t const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld4q_dup_f16 #define vld4q_dup_f16(a) simde_vld4q_dup_f16((a)) #endif @@ -485,7 +487,8 @@ simde_vld4_dup_p16(simde_poly16_t const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) #undef vld4_dup_p16 #define vld4_dup_p16(a) simde_vld4_dup_p16((a)) #endif @@ -523,7 +526,8 @@ simde_vld4q_dup_p8(simde_poly8_t const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) #undef vld4q_dup_p8 #define vld4q_dup_p8(a) simde_vld4q_dup_p8((a)) #endif @@ -542,7 +546,8 @@ simde_vld4q_dup_p16(simde_poly16_t const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_95399)) #undef vld4q_dup_p16 #define vld4q_dup_p16(a) simde_vld4q_dup_p16((a)) #endif @@ -580,7 +585,8 @@ simde_vld4_dup_bf16(simde_bfloat16_t const 
ptr[HEDLEY_ARRAY_PARAM(2)]) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld4_dup_bf16 #define vld4_dup_bf16(a) simde_vld4_dup_bf16((a)) #endif @@ -599,7 +605,8 @@ simde_vld4q_dup_bf16(simde_bfloat16 const * ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld4q_dup_bf16 #define vld4q_dup_bf16(a) simde_vld4q_dup_bf16((a)) #endif diff --git a/thirdparty/simde/arm/neon/ld4_lane.h b/thirdparty/simde/arm/neon/ld4_lane.h index ed8a7e4d2..cdcf079d4 100644 --- a/thirdparty/simde/arm/neon/ld4_lane.h +++ b/thirdparty/simde/arm/neon/ld4_lane.h @@ -285,7 +285,8 @@ simde_vld4_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_floa #define simde_vld4_lane_f16(ptr, src, lane) vld4_lane_f16(ptr, src, lane) #endif #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld4_lane_f16 #define vld4_lane_f16(ptr, src, lane) simde_vld4_lane_f16((ptr), (src), (lane)) #endif @@ -582,7 +583,8 @@ simde_vld4q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_flo #define simde_vld4q_lane_f16(ptr, src, lane) vld4q_lane_f16(ptr, src, lane) #endif #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vld4q_lane_f16 #define vld4q_lane_f16(ptr, src, lane) simde_vld4q_lane_f16((ptr), (src), (lane)) #endif @@ -790,7 +792,8 @@ simde_vld4_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_bf #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vld4_lane_bf16(ptr, src, lane) vld4_lane_bf16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld4_lane_bf16 #define vld4_lane_bf16(ptr, src, lane) simde_vld4_lane_bf16((ptr), (src), (lane)) #endif @@ -812,7 +815,8 @@ simde_vld4q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_b #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) #define simde_vld4q_lane_bf16(ptr, src, lane) vld4q_lane_bf16(ptr, src, lane) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vld4q_lane_bf16 #define vld4q_lane_bf16(ptr, src, lane) simde_vld4q_lane_bf16((ptr), (src), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/max.h b/thirdparty/simde/arm/neon/max.h index 04c38184a..c18bc2138 100644 --- a/thirdparty/simde/arm/neon/max.h +++ b/thirdparty/simde/arm/neon/max.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Yung-Cheng Su */ #if !defined(SIMDE_ARM_NEON_MAX_H) @@ -54,7 +55,8 @@ simde_vmaxh_f16(simde_float16_t a, simde_float16_t b) { return simde_float16_from_float32(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vmaxh_f16 #define vmaxh_f16(a, b) simde_vmaxh_f16((a), (b)) #endif @@ -78,7 +80,8 @@ simde_vmax_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) 
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vmax_f16 #define vmax_f16(a, b) simde_vmax_f16((a), (b)) #endif @@ -94,14 +97,27 @@ simde_vmax_f32(simde_float32x2_t a, simde_float32x2_t b) { a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + #if defined(SIMDE_RISCV_V_NATIVE) #if !defined(SIMDE_FAST_NANS) - r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NANF); + vbool32_t va_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2); + vbool32_t vb_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b_.sv64 , 2) , 512 , 2); + vbool32_t vab_mask = __riscv_vmnor_mm_b32(va_mask , vb_mask , 2); + vfloat32m1_t vnan = __riscv_vfmv_v_f_f32m1(SIMDE_MATH_NANF , 2); + r_.sv64 = __riscv_vfmax_vv_f32m1_m(vab_mask , a_.sv64 , b_.sv64 , 2); + r_.sv64 = __riscv_vmerge_vvm_f32m1(vnan , r_.sv64 , vab_mask , 2); #else - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + r_.sv64 = __riscv_vfmax_vv_f32m1(a_.sv64, b_.sv64, 2); #endif - } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + #if !defined(SIMDE_FAST_NANS) + r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NANF); + #else + r_.values[i] = (a_.values[i] > b_.values[i]) ? 
a_.values[i] : b_.values[i]; + #endif + } + #endif return simde_float32x2_from_private(r_); #endif @@ -122,14 +138,28 @@ simde_vmax_f64(simde_float64x1_t a, simde_float64x1_t b) { a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + #if defined(SIMDE_RISCV_V_NATIVE) #if !defined(SIMDE_FAST_NANS) - r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NAN); + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t va_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1); + vbool64_t vb_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(b_.sv64 , 1) , 512 , 1); + vbool64_t vab_mask = __riscv_vmnor_mm_b64(va_mask , vb_mask , 1); + vfloat64m1_t vnan = __riscv_vfmv_v_f_f64m1(nan , 1); + r_.sv64 = __riscv_vfmax_vv_f64m1_m(vab_mask , a_.sv64 , b_.sv64 , 1); + r_.sv64 = __riscv_vmerge_vvm_f64m1(vnan, r_.sv64, vab_mask , 1); #else - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + r_.sv64 = __riscv_vfmax_vv_f64m1(a_.sv64, b_.sv64, 1); #endif - } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + #if !defined(SIMDE_FAST_NANS) + r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NAN); + #else + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + #endif + } + #endif return simde_float64x1_from_private(r_); #endif @@ -152,10 +182,14 @@ simde_vmax_s8(simde_int8x8_t a, simde_int8x8_t b) { a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? 
a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i8m1(a_.sv64, b_.sv64, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int8x8_from_private(r_); #endif @@ -178,10 +212,14 @@ simde_vmax_s16(simde_int16x4_t a, simde_int16x4_t b) { a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i16m1(a_.sv64, b_.sv64, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int16x4_from_private(r_); #endif @@ -204,10 +242,14 @@ simde_vmax_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i32m1(a_.sv64, b_.sv64, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int32x2_from_private(r_); #endif @@ -228,10 +270,14 @@ simde_x_vmax_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? 
a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmax_vv_i64m1(a_.sv64, b_.sv64, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int64x1_from_private(r_); #endif @@ -250,10 +296,14 @@ simde_vmax_u8(simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmaxu_vv_u8m1(a_.sv64, b_.sv64, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_uint8x8_from_private(r_); #endif @@ -279,6 +329,8 @@ simde_vmax_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) /* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881656284 */ r_.m64 = _mm_add_pi16(b_.m64, _mm_subs_pu16(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmaxu_vv_u16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -307,10 +359,14 @@ simde_vmax_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? 
a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmaxu_vv_u32m1(a_.sv64, b_.sv64, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_uint32x2_from_private(r_); #endif @@ -331,10 +387,14 @@ simde_x_vmax_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmaxu_vv_u64m1(a_.sv64, b_.sv64, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_uint64x1_from_private(r_); #endif @@ -359,7 +419,8 @@ simde_vmaxq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vmaxq_f16 #define vmaxq_f16(a, b) simde_vmaxq_f16((a), (b)) #endif @@ -411,6 +472,17 @@ simde_vmaxq_f32(simde_float32x4_t a, simde_float32x4_t b) { #endif #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_max(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + #if !defined(SIMDE_FAST_NANS) + vbool32_t va_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4); + vbool32_t vb_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b_.sv128 , 4) , 512 , 4); + vbool32_t vab_mask = __riscv_vmnor_mm_b32(va_mask , vb_mask , 4); + vfloat32m1_t vnan = 
__riscv_vfmv_v_f_f32m1(SIMDE_MATH_NANF , 4); + r_.sv128 = __riscv_vfmax_vv_f32m1_m(vab_mask , a_.sv128 , b_.sv128 , 4); + r_.sv128 = __riscv_vmerge_vvm_f32m1(vnan , r_.sv128 , vab_mask , 4); + #else + r_.sv128 = __riscv_vfmax_vv_f32m1(a_.sv128, b_.sv128, 4); + #endif #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -458,6 +530,18 @@ simde_vmaxq_f64(simde_float64x2_t a, simde_float64x2_t b) { #endif #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_max(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + #if !defined(SIMDE_FAST_NANS) + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t va_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2); + vbool64_t vb_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(b_.sv128 , 2) , 512 , 2); + vbool64_t vab_mask = __riscv_vmnor_mm_b64(va_mask , vb_mask , 2); + vfloat64m1_t vnan = __riscv_vfmv_v_f_f64m1(nan , 2); + r_.sv128 = __riscv_vfmax_vv_f64m1_m(vab_mask , a_.sv128 , b_.sv128 , 2); + r_.sv128 = __riscv_vmerge_vvm_f64m1(vnan, r_.sv128, vab_mask , 2); + #else + r_.sv128 = __riscv_vfmax_vv_f64m1(a_.sv128, b_.sv128, 2); + #endif #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -501,6 +585,15 @@ simde_vmaxq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.v128 = wasm_i8x16_max(a_.v128, b_.v128); #endif + return simde_int8x16_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b); + + r_.sv128 = __riscv_vmax_vv_i8m1(a_.sv128, b_.sv128, 16); + return simde_int8x16_from_private(r_); #else return simde_vbslq_s8(simde_vcgtq_s8(a, b), a, b); @@ -532,6 +625,15 @@ simde_vmaxq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.v128 = wasm_i16x8_max(a_.v128, b_.v128); #endif + return simde_int16x8_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private + r_, + a_ = 
simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b); + + r_.sv128 = __riscv_vmax_vv_i16m1(a_.sv128, b_.sv128, 8); + return simde_int16x8_from_private(r_); #else return simde_vbslq_s16(simde_vcgtq_s16(a, b), a, b); @@ -563,6 +665,15 @@ simde_vmaxq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.v128 = wasm_i32x4_max(a_.v128, b_.v128); #endif + return simde_int32x4_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b); + + r_.sv128 = __riscv_vmax_vv_i32m1(a_.sv128, b_.sv128, 4); + return simde_int32x4_from_private(r_); #else return simde_vbslq_s32(simde_vcgtq_s32(a, b), a, b); @@ -578,6 +689,15 @@ simde_int64x2_t simde_x_vmaxq_s64(simde_int64x2_t a, simde_int64x2_t b) { #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) return vec_max(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b); + + r_.sv128 = __riscv_vmax_vv_i64m1(a_.sv128, b_.sv128, 2); + + return simde_int64x2_from_private(r_); #else return simde_vbslq_s64(simde_vcgtq_s64(a, b), a, b); #endif @@ -604,6 +724,15 @@ simde_vmaxq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.v128 = wasm_u8x16_max(a_.v128, b_.v128); #endif + return simde_uint8x16_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b); + + r_.sv128 = __riscv_vmaxu_vv_u8m1(a_.sv128, b_.sv128, 16); + return simde_uint8x16_from_private(r_); #else return simde_vbslq_u8(simde_vcgtq_u8(a, b), a, b); @@ -638,6 +767,15 @@ simde_vmaxq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.v128 = wasm_u16x8_max(a_.v128, b_.v128); #endif + return simde_uint16x8_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + 
+ r_.sv128 = __riscv_vmaxu_vv_u16m1(a_.sv128, b_.sv128, 8); + return simde_uint16x8_from_private(r_); #else return simde_vbslq_u16(simde_vcgtq_u16(a, b), a, b); @@ -669,6 +807,15 @@ simde_vmaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { r_.v128 = wasm_u32x4_max(a_.v128, b_.v128); #endif + return simde_uint32x4_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + + r_.sv128 = __riscv_vmaxu_vv_u32m1(a_.sv128, b_.sv128, 4); + return simde_uint32x4_from_private(r_); #else return simde_vbslq_u32(simde_vcgtq_u32(a, b), a, b); @@ -684,6 +831,15 @@ simde_uint64x2_t simde_x_vmaxq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) return vec_max(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b); + + r_.sv128 = __riscv_vmaxu_vv_u64m1(a_.sv128, b_.sv128, 2); + + return simde_uint64x2_from_private(r_); #else return simde_vbslq_u64(simde_vcgtq_u64(a, b), a, b); #endif diff --git a/thirdparty/simde/arm/neon/maxnm.h b/thirdparty/simde/arm/neon/maxnm.h index 8bee88054..c8ed1b557 100644 --- a/thirdparty/simde/arm/neon/maxnm.h +++ b/thirdparty/simde/arm/neon/maxnm.h @@ -61,7 +61,8 @@ simde_vmaxnmh_f16(simde_float16_t a, simde_float16_t b) { #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) #undef vmaxnmh_f16 #define vmaxnmh_f16(a, b) simde_vmaxnmh_f16((a), (b)) #endif @@ -85,7 +86,8 @@ simde_vmaxnm_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) #undef vmaxnm_f16 #define vmaxnm_f16(a, b) simde_vmaxnm_f16((a), (b)) #endif @@ -109,7 +111,8 @@ simde_vmaxnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) #undef vmaxnmq_f16 #define vmaxnmq_f16(a, b) simde_vmaxnmq_f16((a), (b)) #endif @@ -145,7 +148,8 @@ simde_vmaxnm_f32(simde_float32x2_t a, simde_float32x2_t b) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6))) #undef vmaxnm_f32 #define vmaxnm_f32(a, b) simde_vmaxnm_f32((a), (b)) #endif @@ -233,7 +237,8 @@ simde_vmaxnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6))) #undef vmaxnmq_f32 #define vmaxnmq_f32(a, b) simde_vmaxnmq_f32((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/maxnmv.h b/thirdparty/simde/arm/neon/maxnmv.h index eba518874..6ca95e08e 100644 --- a/thirdparty/simde/arm/neon/maxnmv.h +++ b/thirdparty/simde/arm/neon/maxnmv.h @@ -155,7 +155,8 @@ simde_vmaxnmv_f16(simde_float16x4_t a) { #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && 
\ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vmaxnmv_f16 #define vmaxnmv_f16(v) simde_vmaxnmv_f16(v) #endif @@ -190,7 +191,8 @@ simde_vmaxnmvq_f16(simde_float16x8_t a) { #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vmaxnmvq_f16 #define vmaxnmvq_f16(v) simde_vmaxnmvq_f16(v) #endif diff --git a/thirdparty/simde/arm/neon/maxv.h b/thirdparty/simde/arm/neon/maxv.h index 39c9e0cae..bd545ad6c 100644 --- a/thirdparty/simde/arm/neon/maxv.h +++ b/thirdparty/simde/arm/neon/maxv.h @@ -62,7 +62,8 @@ simde_vmaxv_f16(simde_float16x4_t a) { return simde_float16_from_float32(r); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vmaxv_f16 #define vmaxv_f16(v) simde_vmaxv_f16(v) #endif @@ -262,7 +263,8 @@ simde_vmaxvq_f16(simde_float16x8_t a) { return simde_float16_from_float32(r); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vmaxvq_f16 #define vmaxvq_f16(v) simde_vmaxvq_f16(v) #endif diff --git a/thirdparty/simde/arm/neon/min.h b/thirdparty/simde/arm/neon/min.h index 469e65aa0..bcd201a36 100644 --- a/thirdparty/simde/arm/neon/min.h +++ b/thirdparty/simde/arm/neon/min.h @@ -54,7 +54,8 @@ simde_vminh_f16(simde_float16_t a, simde_float16_t b) { return simde_float16_from_float32(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vminh_f16 #define vminh_f16(a, b) simde_vminh_f16((a), (b)) #endif @@ -78,7 +79,8 @@ 
simde_vmin_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vmin_f16 #define vmin_f16(a, b) simde_vmin_f16((a), (b)) #endif @@ -88,6 +90,24 @@ simde_float32x2_t simde_vmin_f32(simde_float32x2_t a, simde_float32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmin_f32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + + #if !defined(SIMDE_FAST_NANS) + vbool32_t va_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2); + vbool32_t vb_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b_.sv64 , 2) , 512 , 2); + vbool32_t vab_mask = __riscv_vmnor_mm_b32(va_mask , vb_mask , 2); + vfloat32m1_t vnan = __riscv_vfmv_v_f_f32m1(SIMDE_MATH_NANF , 2); + r_.sv64 = __riscv_vfmin_vv_f32m1_m(vab_mask , a_.sv64 , b_.sv64 , 2); + r_.sv64 = __riscv_vmerge_vvm_f32m1(vnan , r_.sv64 , vab_mask , 2); + #else + r_.sv64 = __riscv_vfmin_vv_f32m1(a_.sv64, b_.sv64, 2); + #endif + + return simde_float32x2_from_private(r_); #elif SIMDE_NATURAL_VECTOR_SIZE_GE(64) simde_float32x2_t r = simde_vbsl_f32(simde_vcgt_f32(b, a), a, b); @@ -130,6 +150,24 @@ simde_float64x1_t simde_vmin_f64(simde_float64x1_t a, simde_float64x1_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmin_f64(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b); + #if !defined(SIMDE_FAST_NANS) + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t va_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1); + vbool64_t vb_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(b_.sv64 , 1) , 512 , 1); + vbool64_t 
vab_mask = __riscv_vmnor_mm_b64(va_mask , vb_mask , 1); + vfloat64m1_t vnan = __riscv_vfmv_v_f_f64m1(nan , 1); + r_.sv64 = __riscv_vfmin_vv_f64m1_m(vab_mask , a_.sv64 , b_.sv64 , 1); + r_.sv64 = __riscv_vmerge_vvm_f64m1(vnan, r_.sv64, vab_mask , 1); + #else + r_.sv64 = __riscv_vfmin_vv_f64m1(a_.sv64, b_.sv64, 1); + #endif + + return simde_float64x1_from_private(r_); #elif SIMDE_NATURAL_VECTOR_SIZE_GE(64) simde_float64x1_t r = simde_vbsl_f64(simde_vcgt_f64(b, a), a, b); @@ -180,10 +218,14 @@ simde_vmin_s8(simde_int8x8_t a, simde_int8x8_t b) { a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmin_vv_i8m1(a_.sv64, b_.sv64, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int8x8_from_private(r_); #endif @@ -206,10 +248,14 @@ simde_vmin_s16(simde_int16x4_t a, simde_int16x4_t b) { a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmin_vv_i16m1(a_.sv64, b_.sv64, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? 
a_.values[i] : b_.values[i]; + } + #endif return simde_int16x4_from_private(r_); #endif @@ -232,10 +278,14 @@ simde_vmin_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmin_vv_i32m1(a_.sv64, b_.sv64, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int32x2_from_private(r_); #endif @@ -256,10 +306,14 @@ simde_x_vmin_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmin_vv_i64m1(a_.sv64, b_.sv64, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_int64x1_from_private(r_); #endif @@ -278,10 +332,14 @@ simde_vmin_u8(simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vminu_vv_u8m1(a_.sv64, b_.sv64, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? 
a_.values[i] : b_.values[i]; + } + #endif return simde_uint8x8_from_private(r_); #endif @@ -307,6 +365,8 @@ simde_vmin_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) /* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881656284 */ r_.m64 = _mm_sub_pi16(a_.m64, _mm_subs_pu16(a_.m64, b_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vminu_vv_u16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -335,10 +395,14 @@ simde_vmin_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vminu_vv_u32m1(a_.sv64, b_.sv64, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; + } + #endif return simde_uint32x2_from_private(r_); #endif @@ -359,10 +423,14 @@ simde_x_vmin_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vminu_vv_u64m1(a_.sv64, b_.sv64, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? 
a_.values[i] : b_.values[i]; + } + #endif return simde_uint64x1_from_private(r_); #endif @@ -387,7 +455,8 @@ simde_vminq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vminq_f16 #define vminq_f16(a, b) simde_vminq_f16((a), (b)) #endif @@ -411,6 +480,17 @@ simde_vminq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128 = _mm_blendv_ps(_mm_set1_ps(SIMDE_MATH_NANF), _mm_min_ps(a_.m128, b_.m128), _mm_cmpord_ps(a_.m128, b_.m128)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + #if !defined(SIMDE_FAST_NANS) + vbool32_t va_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4); + vbool32_t vb_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b_.sv128 , 4) , 512 , 4); + vbool32_t vab_mask = __riscv_vmnor_mm_b32(va_mask , vb_mask , 4); + vfloat32m1_t vnan = __riscv_vfmv_v_f_f32m1(SIMDE_MATH_NANF , 4); + r_.sv128 = __riscv_vfmin_vv_f32m1_m(vab_mask , a_.sv128 , b_.sv128 , 4); + r_.sv128 = __riscv_vmerge_vvm_f32m1(vnan , r_.sv128 , vab_mask , 4); + #else + r_.sv128 = __riscv_vfmin_vv_f32m1(a_.sv128, b_.sv128, 4); + #endif #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -455,6 +535,18 @@ simde_vminq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128d = _mm_blendv_pd(_mm_set1_pd(SIMDE_MATH_NAN), _mm_min_pd(a_.m128d, b_.m128d), _mm_cmpord_pd(a_.m128d, b_.m128d)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + #if !defined(SIMDE_FAST_NANS) + simde_float64 nan = SIMDE_MATH_NAN; + vbool64_t va_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2); + 
vbool64_t vb_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(b_.sv128 , 2) , 512 , 2); + vbool64_t vab_mask = __riscv_vmnor_mm_b64(va_mask , vb_mask , 2); + vfloat64m1_t vnan = __riscv_vfmv_v_f_f64m1(nan , 2); + r_.sv128 = __riscv_vfmin_vv_f64m1_m(vab_mask , a_.sv128 , b_.sv128 , 2); + r_.sv128 = __riscv_vmerge_vvm_f64m1(vnan, r_.sv128, vab_mask , 2); + #else + r_.sv128 = __riscv_vfmin_vv_f64m1(a_.sv128, b_.sv128, 2); + #endif #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -497,6 +589,8 @@ simde_vminq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_min_epi8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmin_vv_i8m1(a_.sv128, b_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -529,6 +623,8 @@ simde_vminq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_min_epi16(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmin_vv_i16m1(a_.sv128, b_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -561,6 +657,8 @@ simde_vminq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_min_epi32(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmin_vv_i32m1(a_.sv128, b_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -589,6 +687,8 @@ simde_x_vminq_s64(simde_int64x2_t a, simde_int64x2_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm_min_epi64(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmin_vv_i64m1(a_.sv128, b_.sv128, 
2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -617,6 +717,8 @@ simde_vminq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.m128i = _mm_min_epu8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vminu_vv_u8m1(a_.sv128, b_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -652,6 +754,8 @@ simde_vminq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_sub_epi16(a_.m128i, _mm_subs_epu16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u16x8_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vminu_vv_u16m1(a_.sv128, b_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -707,6 +811,8 @@ simde_vminq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u32x4_min(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vminu_vv_u32m1(a_.sv128, b_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -733,10 +839,14 @@ simde_x_vminq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vminu_vv_u64m1(a_.sv128, b_.sv128, 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? 
a_.values[i] : b_.values[i]; + } + #endif return simde_uint64x2_from_private(r_); #endif diff --git a/thirdparty/simde/arm/neon/minnm.h b/thirdparty/simde/arm/neon/minnm.h index 1f0a003e4..a72a01774 100644 --- a/thirdparty/simde/arm/neon/minnm.h +++ b/thirdparty/simde/arm/neon/minnm.h @@ -61,7 +61,8 @@ simde_vminnmh_f16(simde_float16_t a, simde_float16_t b) { #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) #undef vminnmh_f16 #define vminnmh_f16(a, b) simde_vminnmh_f16((a), (b)) #endif @@ -85,7 +86,8 @@ simde_vminnm_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) #undef vminnm_f16 #define vminnm_f16(a, b) simde_vminnm_f16((a), (b)) #endif @@ -121,7 +123,8 @@ simde_vminnm_f32(simde_float32x2_t a, simde_float32x2_t b) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6))) #undef vminnm_f32 #define vminnm_f32(a, b) simde_vminnm_f32((a), (b)) #endif @@ -181,7 +184,8 @@ simde_vminnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16))) #undef vminnmq_f16 #define vminnmq_f16(a, b) 
simde_vminnmq_f16((a), (b)) #endif @@ -234,7 +238,8 @@ simde_vminnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(__ARM_NEON_FP) && (__ARM_NEON_FP >= 6))) #undef vminnmq_f32 #define vminnmq_f32(a, b) simde_vminnmq_f32((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/minnmv.h b/thirdparty/simde/arm/neon/minnmv.h index 3d10486a0..648e8d90e 100644 --- a/thirdparty/simde/arm/neon/minnmv.h +++ b/thirdparty/simde/arm/neon/minnmv.h @@ -65,7 +65,8 @@ simde_vminnmv_f16(simde_float16x4_t a) { #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vminnmv_f16 #define vminnmv_f16(v) simde_vminnmv_f16(v) #endif @@ -137,7 +138,8 @@ simde_vminnmvq_f16(simde_float16x8_t a) { #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vminnmvq_f16 #define vminnmvq_f16(v) simde_vminnmvq_f16(v) #endif diff --git a/thirdparty/simde/arm/neon/minv.h b/thirdparty/simde/arm/neon/minv.h index 2c7b5e3f7..3ab62a703 100644 --- a/thirdparty/simde/arm/neon/minv.h +++ b/thirdparty/simde/arm/neon/minv.h @@ -62,7 +62,8 @@ simde_vminv_f16(simde_float16x4_t a) { return simde_float16_from_float32(r); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vminv_f16 #define vminv_f16(v) simde_vminv_f16(v) #endif @@ -270,7 +271,8 @@ simde_vminvq_f16(simde_float16x8_t a) { return 
simde_float16_from_float32(r); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vminvq_f16 #define vminvq_f16(v) simde_vminvq_f16(v) #endif diff --git a/thirdparty/simde/arm/neon/mls.h b/thirdparty/simde/arm/neon/mls.h index 0ee06a2b9..cff43085f 100644 --- a/thirdparty/simde/arm/neon/mls.h +++ b/thirdparty/simde/arm/neon/mls.h @@ -262,7 +262,7 @@ simde_vmlsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { return simde_vsubq_f64(a, simde_vmulq_f64(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vmlsq_f64 #define vmlsq_f64(a, b, c) simde_vmlsq_f64((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/mmlaq.h b/thirdparty/simde/arm/neon/mmlaq.h index a56853853..b55882ec4 100644 --- a/thirdparty/simde/arm/neon/mmlaq.h +++ b/thirdparty/simde/arm/neon/mmlaq.h @@ -58,7 +58,8 @@ simde_vmmlaq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { return simde_int32x4_from_private(ret); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_MATMUL_INT8))) #undef vmmlaq_s32 #define vmmlaq_s32(r, a, b) simde_vmmlaq_s32((r), (a), (b)) #endif @@ -85,7 +86,8 @@ simde_vmmlaq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) { return simde_uint32x4_from_private(ret); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_MATMUL_INT8))) #undef vmmlaq_u32 #define vmmlaq_u32(r, a, b) simde_vmmlaq_u32((r), (a), (b)) #endif @@ -113,7 +115,8 @@ simde_vusmmlaq_s32(simde_int32x4_t r, simde_uint8x16_t a, 
simde_int8x16_t b) { return simde_int32x4_from_private(ret); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_MATMUL_INT8))) #undef vusmmlaq_s32 #define vusmmlaq_s32(r, a, b) simde_vusmmlaq_s32((r), (a), (b)) #endif @@ -142,7 +145,9 @@ simde_vbfmmlaq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t return simde_float32x4_from_private(ret); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARCH_ARM_MATMUL_INT8) && \ + defined(SIMDE_ARM_NEON_BF16))) #undef vbfmmlaq_f32 #define vbfmmlaq_f32(r, a, b) simde_vbfmmlaq_f32((r), (a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/mul.h b/thirdparty/simde/arm/neon/mul.h index 590b0eae5..ce2b87c96 100644 --- a/thirdparty/simde/arm/neon/mul.h +++ b/thirdparty/simde/arm/neon/mul.h @@ -50,7 +50,7 @@ simde_vmulh_f16(simde_float16_t a, simde_float16_t b) { return simde_float16_from_float32(a_ * b_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulh_f16 #define vmulh_f16(a, b) simde_vmulh_f16((a), (b)) #endif @@ -76,7 +76,7 @@ simde_vmul_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmul_f16 #define vmul_f16(a, b) simde_vmul_f16((a), (b)) #endif @@ -388,7 +388,7 @@ simde_vmulq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) 
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulq_f16 #define vmulq_f16(a, b) simde_vmulq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/mul_lane.h b/thirdparty/simde/arm/neon/mul_lane.h index 72c032eea..540b57524 100644 --- a/thirdparty/simde/arm/neon/mul_lane.h +++ b/thirdparty/simde/arm/neon/mul_lane.h @@ -50,7 +50,7 @@ simde_vmulh_lane_f16(simde_float16_t a, simde_float16x4_t b, const int lane) #define simde_vmulh_lane_f16(a, b, lane) vmulh_lane_f16((a), (b), (lane)) #endif #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulh_lane_f16 #define vmulh_lane_f16(a, b, lane) simde_vmulh_lane_f16(a, b, lane) #endif @@ -126,7 +126,7 @@ simde_vmulh_laneq_f16(simde_float16_t a, simde_float16x8_t b, const int lane) #define simde_vmulh_laneq_f16(a, b, lane) vmulh_laneq_f16((a), (b), (lane)) #endif #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulh_laneq_f16 #define vmulh_laneq_f16(a, b, lane) simde_vmulh_laneq_f16(a, b, lane) #endif @@ -169,7 +169,7 @@ simde_vmul_lane_f16(simde_float16x4_t a, simde_float16x4_t b, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vmul_lane_f16(a, b, lane) vmul_lane_f16((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmul_lane_f16 #define vmul_lane_f16(a, b, lane) simde_vmul_lane_f16((a), (b), (lane)) #endif @@ -386,7 +386,7 @@ simde_vmul_laneq_s32(simde_int32x2_t a, simde_int32x4_t b, const int lane) 
#else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (HEDLEY_STATIC_CAST(uint32_t, a_.values[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[lane]))); } #endif @@ -477,7 +477,7 @@ simde_vmulq_lane_f16(simde_float16x8_t a, simde_float16x4_t b, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vmulq_lane_f16(a, b, lane) vmulq_lane_f16((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulq_lane_f16 #define vmulq_lane_f16(a, b, lane) simde_vmulq_lane_f16((a), (b), (lane)) #endif @@ -669,7 +669,7 @@ simde_vmulq_laneq_f16(simde_float16x8_t a, simde_float16x8_t b, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vmulq_laneq_f16(a, b, lane) vmulq_laneq_f16((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulq_laneq_f16 #define vmulq_laneq_f16(a, b, lane) simde_vmulq_laneq_f16((a), (b), (lane)) #endif @@ -772,7 +772,7 @@ simde_vmulq_laneq_s32(simde_int32x4_t a, simde_int32x4_t b, const int lane) #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (HEDLEY_STATIC_CAST(uint32_t, a_.values[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.values[lane]))); } #endif @@ -861,7 +861,7 @@ simde_vmul_laneq_f16(simde_float16x4_t a, simde_float16x8_t b, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vmul_laneq_f16(a, b, lane) 
vmul_laneq_f16((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmul_laneq_f16 #define vmul_laneq_f16(a, b, lane) simde_vmul_laneq_f16((a), (b), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/mul_n.h b/thirdparty/simde/arm/neon/mul_n.h index 533754279..09b0cd611 100644 --- a/thirdparty/simde/arm/neon/mul_n.h +++ b/thirdparty/simde/arm/neon/mul_n.h @@ -46,7 +46,7 @@ simde_vmul_n_f16(simde_float16x4_t a, simde_float16_t b) { return simde_vmul_f16(a, simde_vdup_n_f16(b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmul_n_f16 #define vmul_n_f16(a, b) simde_vmul_n_f16((a), (b)) #endif @@ -144,7 +144,7 @@ simde_vmulq_n_f16(simde_float16x8_t a, simde_float16_t b) { return simde_vmulq_f16(a, simde_vdupq_n_f16(b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulq_n_f16 #define vmulq_n_f16(a, b) simde_vmulq_n_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/mull.h b/thirdparty/simde/arm/neon/mull.h index cd5c9112f..b88856b17 100644 --- a/thirdparty/simde/arm/neon/mull.h +++ b/thirdparty/simde/arm/neon/mull.h @@ -280,7 +280,7 @@ simde_vmull_p64(simde_poly64_t a, simde_poly64_t b) { return result; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_CRYPTO)) #undef vmull_p64 #define vmull_p64(a, b) simde_vmull_p64((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/mull_high.h b/thirdparty/simde/arm/neon/mull_high.h index 
87e83369a..b5962c221 100644 --- a/thirdparty/simde/arm/neon/mull_high.h +++ b/thirdparty/simde/arm/neon/mull_high.h @@ -166,7 +166,7 @@ simde_vmull_high_p64(simde_poly64x2_t a, simde_poly64x2_t b) { return simde_vmull_p64(a_.values[1], b_.values[1]); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_CRYPTO)) #undef vmull_high_p64 #define vmull_high_p64(a, b) simde_vmull_high_p64((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/mulx.h b/thirdparty/simde/arm/neon/mulx.h index a089125f6..a2a4e8ad0 100644 --- a/thirdparty/simde/arm/neon/mulx.h +++ b/thirdparty/simde/arm/neon/mulx.h @@ -46,7 +46,7 @@ simde_vmulxh_f16(simde_float16_t a, simde_float16_t b) { simde_float16_to_float32(b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulxh_f16 #define vmulxh_f16(a, b) simde_vmulxh_f16((a), (b)) #endif @@ -98,7 +98,7 @@ simde_vmulx_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulx_f16 #define vmulx_f16(a, b) simde_vmulx_f16((a), (b)) #endif @@ -178,7 +178,7 @@ simde_vmulxq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulxq_f16 #define vmulxq_f16(a, b) simde_vmulxq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/mulx_lane.h b/thirdparty/simde/arm/neon/mulx_lane.h index 
eed553651..4ab5ed8d3 100644 --- a/thirdparty/simde/arm/neon/mulx_lane.h +++ b/thirdparty/simde/arm/neon/mulx_lane.h @@ -44,7 +44,7 @@ simde_vmulxh_lane_f16(simde_float16_t a, simde_float16x4_t b, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vmulxh_lane_f16(a, b, lane) vmulxh_lane_f16((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulxh_lane_f16 #define vmulxh_lane_f16(a, b, lane) simde_vmulxh_lane_f16(a, b, lane) #endif @@ -58,7 +58,7 @@ simde_vmulxs_lane_f32(simde_float32_t a, simde_float32x2_t b, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vmulxs_lane_f32(a, b, lane) vmulxs_lane_f32((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulxs_lane_f32 #define vmulxs_lane_f32(a, b, lane) simde_vmulxs_lane_f32(a, b, lane) #endif @@ -72,7 +72,7 @@ simde_vmulxd_lane_f64(simde_float64_t a, simde_float64x1_t b, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vmulxd_lane_f64(a, b, lane) vmulxd_lane_f64((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulxd_lane_f64 #define vmulxd_lane_f64(a, b, lane) simde_vmulxd_lane_f64(a, b, lane) #endif @@ -88,7 +88,7 @@ simde_vmulxh_laneq_f16(simde_float16_t a, simde_float16x8_t b, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vmulxh_laneq_f16(a, b, lane) vmulxh_laneq_f16((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulxh_laneq_f16 #define vmulxh_laneq_f16(a, b, lane) simde_vmulxh_laneq_f16(a, b, lane) #endif @@ -142,7 +142,7 @@ simde_vmulx_lane_f16(simde_float16x4_t a, simde_float16x4_t b, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vmulx_lane_f16(a, b, lane) vmulx_lane_f16((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulx_lane_f16 #define vmulx_lane_f16(a, b, lane) simde_vmulx_lane_f16((a), (b), (lane)) #endif @@ -224,7 +224,7 @@ simde_vmulxq_lane_f16(simde_float16x8_t a, simde_float16x4_t b, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vmulxq_lane_f16(a, b, lane) vmulxq_lane_f16((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulxq_lane_f16 #define vmulxq_lane_f16(a, b, lane) simde_vmulxq_lane_f16((a), (b), (lane)) #endif @@ -306,7 +306,7 @@ simde_vmulxq_laneq_f16(simde_float16x8_t a, simde_float16x8_t b, const int lane) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vmulxq_laneq_f16(a, b, lane) vmulxq_laneq_f16((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulxq_laneq_f16 #define vmulxq_laneq_f16(a, b, lane) simde_vmulxq_laneq_f16((a), (b), (lane)) #endif @@ -388,7 +388,7 @@ simde_vmulx_laneq_f16(simde_float16x4_t a, simde_float16x8_t b, const int lane) #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vmulx_laneq_f16(a, b, lane) vmulx_laneq_f16((a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulx_laneq_f16 #define vmulx_laneq_f16(a, b, lane) simde_vmulx_laneq_f16((a), (b), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/mulx_n.h b/thirdparty/simde/arm/neon/mulx_n.h index be78a834d..efb10f644 100644 --- a/thirdparty/simde/arm/neon/mulx_n.h +++ b/thirdparty/simde/arm/neon/mulx_n.h @@ -44,7 +44,7 @@ simde_vmulx_n_f16(simde_float16x4_t a, simde_float16 b) { return simde_vmul_f16(a, simde_vdup_n_f16(b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulx_n_f16 #define vmulx_n_f16(a, b) simde_vmulx_n_f16((a), (b)) #endif @@ -58,7 +58,7 @@ simde_vmulxq_n_f16(simde_float16x8_t a, simde_float16 b) { return simde_vmulq_f16(a, simde_vdupq_n_f16(b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vmulxq_n_f16 #define vmulxq_n_f16(a, b) simde_vmulxq_n_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/mvn.h b/thirdparty/simde/arm/neon/mvn.h index 7ded6b502..c7ed99ffb 100644 --- a/thirdparty/simde/arm/neon/mvn.h +++ b/thirdparty/simde/arm/neon/mvn.h @@ -55,10 +55,10 @@ simde_vmvnq_s8(simde_int8x16_t a) { r_.m128i = _mm_andnot_si128(a_.m128i, _mm_cmpeq_epi8(a_.m128i, a_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_not(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_i8m1(a_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = 
~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv128 = __riscv_vnot_v_i8m1(a_.sv128, b_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -92,10 +92,10 @@ simde_vmvnq_s16(simde_int16x8_t a) { r_.m128i = _mm_andnot_si128(a_.m128i, _mm_cmpeq_epi16(a_.m128i, a_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_not(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_i16m1(a_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv128 = __riscv_vnot_v_i16m1(a_.sv128, b_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -129,10 +129,10 @@ simde_vmvnq_s32(simde_int32x4_t a) { r_.m128i = _mm_andnot_si128(a_.m128i, _mm_cmpeq_epi32(a_.m128i, a_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_not(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_i32m1(a_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv128 = __riscv_vnot_v_i32m1(a_.sv128, b_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -166,10 +166,10 @@ simde_vmvnq_u8(simde_uint8x16_t a) { r_.m128i = _mm_andnot_si128(a_.m128i, _mm_cmpeq_epi8(a_.m128i, a_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_not(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_u8m1(a_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv128 = __riscv_vnot_v_u8m1(a_.sv128, b_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -203,10 +203,10 @@ simde_vmvnq_u16(simde_uint16x8_t a) { r_.m128i = _mm_andnot_si128(a_.m128i, _mm_cmpeq_epi16(a_.m128i, 
a_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_not(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_u16m1(a_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv128 = __riscv_vnot_v_u16m1(a_.sv128, b_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -240,10 +240,10 @@ simde_vmvnq_u32(simde_uint32x4_t a) { r_.m128i = _mm_andnot_si128(a_.m128i, _mm_cmpeq_epi32(a_.m128i, a_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_not(a_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vnot_v_u32m1(a_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv128 = __riscv_vnot_v_u32m1(a_.sv128, b_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -271,10 +271,10 @@ simde_vmvn_s8(simde_int8x8_t a) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi8(a_.m64, a_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_i8m1(a_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv64 = __riscv_vnot_v_i8m1(a_.sv64, b_.sv64, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -302,10 +302,10 @@ simde_vmvn_s16(simde_int16x4_t a) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi16(a_.m64, a_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_i16m1(a_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv64 = __riscv_vnot_v_i16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -333,10 
+333,10 @@ simde_vmvn_s32(simde_int32x2_t a) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi32(a_.m64, a_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_i32m1(a_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv64 = __riscv_vnot_v_i32m1(a_.sv64, b_.sv64, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -364,10 +364,10 @@ simde_vmvn_u8(simde_uint8x8_t a) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi8(a_.m64, a_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_u8m1(a_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv64 = __riscv_vnot_v_u8m1(a_.sv64, b_.sv64, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -395,10 +395,10 @@ simde_vmvn_u16(simde_uint16x4_t a) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi16(a_.m64, a_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_u16m1(a_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv64 = __riscv_vnot_v_u16m1(a_.sv64, b_.sv64, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -426,10 +426,10 @@ simde_vmvn_u32(simde_uint32x2_t a) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_andnot_si64(a_.m64, _mm_cmpeq_pi32(a_.m64, a_.m64)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vnot_v_u32m1(a_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = ~a_.values; - #elif defined(SIMDE_RISCV_V_NATIVE) - r_.sv64 = __riscv_vnot_v_u32m1(a_.sv64, b_.sv64, 2); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -457,7 +457,7 
@@ simde_vmvn_p8(simde_poly8x8_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = ~(a_.values[i]); + r_.values[i] = HEDLEY_STATIC_CAST(simde_poly8, ~(a_.values[i])); } return simde_poly8x8_from_private(r_); @@ -480,7 +480,7 @@ simde_vmvnq_p8(simde_poly8x16_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = ~(a_.values[i]); + r_.values[i] = HEDLEY_STATIC_CAST(simde_poly8, ~(a_.values[i])); } return simde_poly8x16_from_private(r_); diff --git a/thirdparty/simde/arm/neon/neg.h b/thirdparty/simde/arm/neon/neg.h index e6b2a8e48..2c4c4bb04 100644 --- a/thirdparty/simde/arm/neon/neg.h +++ b/thirdparty/simde/arm/neon/neg.h @@ -57,7 +57,7 @@ simde_vnegh_f16(simde_float16_t a) { return simde_float16_from_float32(-simde_float16_to_float32(a)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vnegh_f16 #define vnegh_f16(a) simde_vnegh_f16(a) #endif @@ -80,7 +80,7 @@ simde_vneg_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vneg_f16 #define vneg_f16(a) simde_vneg_f16(a) #endif @@ -265,7 +265,7 @@ simde_vnegq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vnegq_f16 #define vnegq_f16(a) simde_vnegq_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/padd.h b/thirdparty/simde/arm/neon/padd.h index 5c34cbe89..11186a1d7 100644 --- 
a/thirdparty/simde/arm/neon/padd.h +++ b/thirdparty/simde/arm/neon/padd.h @@ -106,7 +106,7 @@ simde_vpadd_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_vadd_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0) && defined(SIMDE_ARM_NEON_FP16))) #undef vpadd_f16 #define vpadd_f16(a, b) simde_vpadd_f16((a), (b)) #endif @@ -120,7 +120,7 @@ simde_vpadd_f32(simde_float32x2_t a, simde_float32x2_t b) { return simde_vadd_f32(simde_vuzp1_f32(a, b), simde_vuzp2_f32(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0))) #undef vpadd_f32 #define vpadd_f32(a, b) simde_vpadd_f32((a), (b)) #endif @@ -222,7 +222,7 @@ simde_vpaddq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_vaddq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vpaddq_f16 #define vpaddq_f16(a, b) simde_vpaddq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/pmax.h b/thirdparty/simde/arm/neon/pmax.h index d8de39d76..249998212 100644 --- a/thirdparty/simde/arm/neon/pmax.h +++ b/thirdparty/simde/arm/neon/pmax.h @@ -77,7 +77,7 @@ simde_vpmax_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_vmax_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vpmax_f16 #define vpmax_f16(a, b) 
simde_vpmax_f16((a), (b)) #endif @@ -189,7 +189,7 @@ simde_vpmaxq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_vmaxq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vpmaxq_f16 #define vpmaxq_f16(a, b) simde_vpmaxq_f16((a), (b)) #endif @@ -301,7 +301,7 @@ simde_vpmaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { return simde_vmaxq_u32(simde_vuzp1q_u32(a, b), simde_vuzp2q_u32(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vpmaxq_u32 #define vpmaxq_u32(a, b) simde_vpmaxq_u32((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/pmaxnm.h b/thirdparty/simde/arm/neon/pmaxnm.h index 5fa519d5e..b85f56e0a 100644 --- a/thirdparty/simde/arm/neon/pmaxnm.h +++ b/thirdparty/simde/arm/neon/pmaxnm.h @@ -75,7 +75,7 @@ simde_vpmaxnm_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_vmax_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vpmaxnm_f16 #define vpmaxnm_f16(a, b) simde_vpmaxnm_f16((a), (b)) #endif @@ -103,7 +103,7 @@ simde_vpmaxnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_vmaxq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vpmaxnmq_f16 #define vpmaxnmq_f16(a, b) simde_vpmaxnmq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/pmin.h b/thirdparty/simde/arm/neon/pmin.h index 2f76c6380..640ca5677 100644 
--- a/thirdparty/simde/arm/neon/pmin.h +++ b/thirdparty/simde/arm/neon/pmin.h @@ -76,7 +76,7 @@ simde_vpmin_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_vmin_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vpmin_f16 #define vpmin_f16(a, b) simde_vpmin_f16((a), (b)) #endif @@ -188,7 +188,7 @@ simde_vpminq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_vminq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vpminq_f16 #define vpminq_f16(a, b) simde_vpminq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/pminnm.h b/thirdparty/simde/arm/neon/pminnm.h index 99de03555..36eae0f15 100644 --- a/thirdparty/simde/arm/neon/pminnm.h +++ b/thirdparty/simde/arm/neon/pminnm.h @@ -75,7 +75,7 @@ simde_vpminnm_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_vmin_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vpminnm_f16 #define vpminnm_f16(a, b) simde_vpminnm_f16((a), (b)) #endif @@ -103,7 +103,7 @@ simde_vpminnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_vminq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vpminnmq_f16 #define vpminnmq_f16(a, b) simde_vpminnmq_f16((a), (b)) #endif 
diff --git a/thirdparty/simde/arm/neon/qabs.h b/thirdparty/simde/arm/neon/qabs.h index 9ad7d7c83..1582af74b 100644 --- a/thirdparty/simde/arm/neon/qabs.h +++ b/thirdparty/simde/arm/neon/qabs.h @@ -47,7 +47,7 @@ simde_vqabsb_s8(int8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqabsb_s8(a); #else - return a == INT8_MIN ? INT8_MAX : (a < 0 ? -a : a); + return a == INT8_MIN ? INT8_MAX : (a < 0 ? HEDLEY_STATIC_CAST(int8_t, -a) : a); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -61,7 +61,7 @@ simde_vqabsh_s16(int16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqabsh_s16(a); #else - return a == INT16_MIN ? INT16_MAX : (a < 0 ? -a : a); + return a == INT16_MIN ? INT16_MAX : (a < 0 ? HEDLEY_STATIC_CAST(int16_t, -a) : a); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) diff --git a/thirdparty/simde/arm/neon/qdmulh.h b/thirdparty/simde/arm/neon/qdmulh.h index 29d1078cb..1e879dca9 100644 --- a/thirdparty/simde/arm/neon/qdmulh.h +++ b/thirdparty/simde/arm/neon/qdmulh.h @@ -53,7 +53,7 @@ simde_vqdmulhs_s32(int32_t a, int32_t b) { } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vqdmulhs_s32 - #define vqdmulhs_s32(a) simde_vqdmulhs_s32((a)) + #define vqdmulhs_s32(a, b) simde_vqdmulhs_s32((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -64,7 +64,7 @@ simde_vqdmulh_s16(simde_int16x4_t a, simde_int16x4_t b) { #else simde_int16x4_private r_; - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !(HEDLEY_GCC_VERSION_CHECK(12,1,0) && defined(SIMDE_ARCH_ZARCH)) + #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(SIMDE_ARCH_ZARCH) simde_int16x8_private tmp_ = simde_int16x8_to_private( simde_vreinterpretq_s16_s32( @@ -113,7 +113,7 @@ simde_vqdmulh_s32(simde_int32x2_t a, simde_int32x2_t b) { #else simde_int32x2_private r_; - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !(HEDLEY_GCC_VERSION_CHECK(12,1,0) && defined(SIMDE_ARCH_ZARCH)) + #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && 
!defined(SIMDE_ARCH_ZARCH) simde_int32x4_private tmp_ = simde_int32x4_to_private( simde_vreinterpretq_s32_s64( diff --git a/thirdparty/simde/arm/neon/qneg.h b/thirdparty/simde/arm/neon/qneg.h index cd5e4ec97..e39b88e93 100644 --- a/thirdparty/simde/arm/neon/qneg.h +++ b/thirdparty/simde/arm/neon/qneg.h @@ -45,7 +45,7 @@ simde_vqnegb_s8(int8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqnegb_s8(a); #else - return a == INT8_MIN ? INT8_MAX : -a; + return a == INT8_MIN ? INT8_MAX : HEDLEY_STATIC_CAST(int8_t, -a); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -59,7 +59,7 @@ simde_vqnegh_s16(int16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqnegh_s16(a); #else - return a == INT16_MIN ? INT16_MAX : -a; + return a == INT16_MIN ? INT16_MAX : HEDLEY_STATIC_CAST(int16_t, -a); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -109,7 +109,7 @@ simde_vqneg_s8(simde_int8x8_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] == INT8_MIN) ? INT8_MAX : -(a_.values[i]); + r_.values[i] = (a_.values[i] == INT8_MIN) ? INT8_MAX : HEDLEY_STATIC_CAST(int8_t, -(a_.values[i])); } return simde_int8x8_from_private(r_); @@ -134,7 +134,7 @@ simde_vqneg_s16(simde_int16x4_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] == INT16_MIN) ? INT16_MAX : -(a_.values[i]); + r_.values[i] = (a_.values[i] == INT16_MIN) ? INT16_MAX : HEDLEY_STATIC_CAST(int16_t, -(a_.values[i])); } return simde_int16x4_from_private(r_); @@ -209,7 +209,7 @@ simde_vqnegq_s8(simde_int8x16_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] == INT8_MIN) ? INT8_MAX : -(a_.values[i]); + r_.values[i] = (a_.values[i] == INT8_MIN) ? 
INT8_MAX : HEDLEY_STATIC_CAST(int8_t, -(a_.values[i])); } return simde_int8x16_from_private(r_); @@ -234,7 +234,7 @@ simde_vqnegq_s16(simde_int16x8_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] == INT16_MIN) ? INT16_MAX : -(a_.values[i]); + r_.values[i] = (a_.values[i] == INT16_MIN) ? INT16_MAX : HEDLEY_STATIC_CAST(int16_t, -(a_.values[i])); } return simde_int16x8_from_private(r_); diff --git a/thirdparty/simde/arm/neon/qrdmlah.h b/thirdparty/simde/arm/neon/qrdmlah.h index 9442101e3..b74f40db0 100644 --- a/thirdparty/simde/arm/neon/qrdmlah.h +++ b/thirdparty/simde/arm/neon/qrdmlah.h @@ -48,7 +48,7 @@ simde_vqrdmlahh_s16(int16_t a, int16_t b, int16_t c) { return simde_vqmovns_s32(HEDLEY_STATIC_CAST(int32_t, r)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahh_s16 #define vqrdmlahh_s16(a, b, c) simde_vqrdmlahh_s16((a), (b), (c)) #endif @@ -74,7 +74,7 @@ simde_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) { return HEDLEY_STATIC_CAST(int32_t, ((sum2 >> 32) & 0xffffffff)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahs_s32 #define vqrdmlahs_s32(a, b, c) simde_vqrdmlahs_s32((a), (b), (c)) #endif @@ -100,7 +100,7 @@ simde_vqrdmlah_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) { return simde_int16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlah_s16 #define vqrdmlah_s16(a, b, c) simde_vqrdmlah_s16((a), (b), (c)) #endif @@ -125,7 +125,7 @@ 
simde_vqrdmlah_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) { return simde_int32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlah_s32 #define vqrdmlah_s32(a, b, c) simde_vqrdmlah_s32((a), (b), (c)) #endif @@ -150,7 +150,7 @@ simde_vqrdmlahq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { return simde_int16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahq_s16 #define vqrdmlahq_s16(a, b, c) simde_vqrdmlahq_s16((a), (b), (c)) #endif @@ -175,7 +175,7 @@ simde_vqrdmlahq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { return simde_int32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahq_s32 #define vqrdmlahq_s32(a, b, c) simde_vqrdmlahq_s32((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/qrdmlah_lane.h b/thirdparty/simde/arm/neon/qrdmlah_lane.h index 4f18bbb5f..61ed84934 100644 --- a/thirdparty/simde/arm/neon/qrdmlah_lane.h +++ b/thirdparty/simde/arm/neon/qrdmlah_lane.h @@ -41,7 +41,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlahh_lane_s16(a, b, v, lane) simde_vqrdmlahh_s16((a), (b), simde_vget_lane_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahh_lane_s16 #define vqrdmlahh_lane_s16(a, b, v, lane) simde_vqrdmlahh_lane_s16((a), (b), (v), (lane)) #endif @@ -51,7 +51,7 @@ 
SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlahh_laneq_s16(a, b, v, lane) simde_vqrdmlahh_s16((a), (b), simde_vgetq_lane_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahh_laneq_s16 #define vqrdmlahh_laneq_s16(a, b, v, lane) simde_vqrdmlahh_laneq_s16((a), (b), (v), (lane)) #endif @@ -61,7 +61,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlahs_lane_s32(a, b, v, lane) simde_vqrdmlahs_s32((a), (b), simde_vget_lane_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahs_lane_s32 #define vqrdmlahs_lane_s32(a, b, v, lane) simde_vqrdmlahs_lane_s32((a), (b), (v), (lane)) #endif @@ -71,7 +71,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlahs_laneq_s32(a, b, v, lane) simde_vqrdmlahs_s32((a), (b), simde_vgetq_lane_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahs_laneq_s32 #define vqrdmlahs_laneq_s32(a, b, v, lane) simde_vqrdmlahs_laneq_s32((a), (b), (v), (lane)) #endif @@ -81,7 +81,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlah_lane_s16(a, b, v, lane) simde_vqrdmlah_s16((a), (b), simde_vdup_lane_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlah_lane_s16 #define vqrdmlah_lane_s16(a, b, v, lane) simde_vqrdmlah_lane_s16((a), (b), (v), (lane)) #endif @@ -91,7 +91,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlah_lane_s32(a, b, v, lane) 
simde_vqrdmlah_s32((a), (b), simde_vdup_lane_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlah_lane_s32 #define vqrdmlah_lane_s32(a, b, v, lane) simde_vqrdmlah_lane_s32((a), (b), (v), (lane)) #endif @@ -101,7 +101,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlahq_lane_s16(a, b, v, lane) simde_vqrdmlahq_s16((a), (b), simde_vdupq_lane_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahq_lane_s16 #define vqrdmlahq_lane_s16(a, b, v, lane) simde_vqrdmlahq_lane_s16((a), (b), (v), (lane)) #endif @@ -111,7 +111,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlahq_lane_s32(a, b, v, lane) simde_vqrdmlahq_s32((a), (b), simde_vdupq_lane_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahq_lane_s32 #define vqrdmlahq_lane_s32(a, b, v, lane) simde_vqrdmlahq_lane_s32((a), (b), (v), (lane)) #endif @@ -121,7 +121,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlah_laneq_s16(a, b, v, lane) simde_vqrdmlah_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlah_laneq_s16 #define vqrdmlah_laneq_s16(a, b, v, lane) simde_vqrdmlah_laneq_s16((a), (b), (v), (lane)) #endif @@ -131,7 +131,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlah_laneq_s32(a, b, v, lane) simde_vqrdmlah_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) #endif -#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlah_laneq_s32 #define vqrdmlah_laneq_s32(a, b, v, lane) simde_vqrdmlah_laneq_s32((a), (b), (v), (lane)) #endif @@ -141,7 +141,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlahq_laneq_s16(a, b, v, lane) simde_vqrdmlahq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahq_laneq_s16 #define vqrdmlahq_laneq_s16(a, b, v, lane) simde_vqrdmlahq_laneq_s16((a), (b), (v), (lane)) #endif @@ -151,7 +151,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlahq_laneq_s32(a, b, v, lane) simde_vqrdmlahq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlahq_laneq_s32 #define vqrdmlahq_laneq_s32(a, b, v, lane) simde_vqrdmlahq_laneq_s32((a), (b), (v), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/qrdmlsh.h b/thirdparty/simde/arm/neon/qrdmlsh.h index eb0be8e7c..71cd1d015 100644 --- a/thirdparty/simde/arm/neon/qrdmlsh.h +++ b/thirdparty/simde/arm/neon/qrdmlsh.h @@ -48,7 +48,7 @@ simde_vqrdmlshh_s16(int16_t a, int16_t b, int16_t c) { return simde_vqmovns_s32(HEDLEY_STATIC_CAST(int32_t, r)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshh_s16 #define vqrdmlshh_s16(a, b, c) simde_vqrdmlshh_s16((a), (b), (c)) #endif @@ -74,7 +74,7 @@ simde_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) { return 
HEDLEY_STATIC_CAST(int32_t, ((sum2 >> 32) & 0xffffffff)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshs_s32 #define vqrdmlshs_s32(a, b, c) simde_vqrdmlshs_s32((a), (b), (c)) #endif @@ -100,7 +100,7 @@ simde_vqrdmlsh_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) { return simde_int16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlsh_s16 #define vqrdmlsh_s16(a, b, c) simde_vqrdmlsh_s16((a), (b), (c)) #endif @@ -125,7 +125,7 @@ simde_vqrdmlsh_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) { return simde_int32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlsh_s32 #define vqrdmlsh_s32(a, b, c) simde_vqrdmlsh_s32((a), (b), (c)) #endif @@ -150,7 +150,7 @@ simde_vqrdmlshq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { return simde_int16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshq_s16 #define vqrdmlshq_s16(a, b, c) simde_vqrdmlshq_s16((a), (b), (c)) #endif @@ -175,7 +175,7 @@ simde_vqrdmlshq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { return simde_int32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef 
vqrdmlshq_s32 #define vqrdmlshq_s32(a, b, c) simde_vqrdmlshq_s32((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/qrdmlsh_lane.h b/thirdparty/simde/arm/neon/qrdmlsh_lane.h index a9584c601..b40a39a9d 100644 --- a/thirdparty/simde/arm/neon/qrdmlsh_lane.h +++ b/thirdparty/simde/arm/neon/qrdmlsh_lane.h @@ -41,7 +41,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlshh_lane_s16(a, b, v, lane) simde_vqrdmlshh_s16((a), (b), simde_vget_lane_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshh_lane_s16 #define vqrdmlshh_lane_s16(a, b, v, lane) simde_vqrdmlshh_lane_s16((a), (b), (v), (lane)) #endif @@ -51,7 +51,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlshh_laneq_s16(a, b, v, lane) simde_vqrdmlshh_s16((a), (b), simde_vgetq_lane_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshh_laneq_s16 #define vqrdmlshh_laneq_s16(a, b, v, lane) simde_vqrdmlshh_laneq_s16((a), (b), (v), (lane)) #endif @@ -61,7 +61,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlshs_lane_s32(a, b, v, lane) simde_vqrdmlshs_s32((a), (b), simde_vget_lane_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshs_lane_s32 #define vqrdmlshs_lane_s32(a, b, v, lane) simde_vqrdmlshs_lane_s32((a), (b), (v), (lane)) #endif @@ -71,7 +71,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlshs_laneq_s32(a, b, v, lane) simde_vqrdmlshs_s32((a), (b), simde_vgetq_lane_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshs_laneq_s32 #define vqrdmlshs_laneq_s32(a, b, v, lane) simde_vqrdmlshs_laneq_s32((a), (b), (v), (lane)) #endif @@ -81,7 +81,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlsh_lane_s16(a, b, v, lane) simde_vqrdmlsh_s16((a), (b), simde_vdup_lane_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlsh_lane_s16 #define vqrdmlsh_lane_s16(a, b, v, lane) simde_vqrdmlsh_lane_s16((a), (b), (v), (lane)) #endif @@ -91,7 +91,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlsh_lane_s32(a, b, v, lane) simde_vqrdmlsh_s32((a), (b), simde_vdup_lane_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlsh_lane_s32 #define vqrdmlsh_lane_s32(a, b, v, lane) simde_vqrdmlsh_lane_s32((a), (b), (v), (lane)) #endif @@ -101,7 +101,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlshq_lane_s16(a, b, v, lane) simde_vqrdmlshq_s16((a), (b), simde_vdupq_lane_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshq_lane_s16 #define vqrdmlshq_lane_s16(a, b, v, lane) simde_vqrdmlshq_lane_s16((a), (b), (v), (lane)) #endif @@ -111,7 +111,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlshq_lane_s32(a, b, v, lane) simde_vqrdmlshq_s32((a), (b), simde_vdupq_lane_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) 
&& !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshq_lane_s32 #define vqrdmlshq_lane_s32(a, b, v, lane) simde_vqrdmlshq_lane_s32((a), (b), (v), (lane)) #endif @@ -121,7 +121,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlsh_laneq_s16(a, b, v, lane) simde_vqrdmlsh_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlsh_laneq_s16 #define vqrdmlsh_laneq_s16(a, b, v, lane) simde_vqrdmlsh_laneq_s16((a), (b), (v), (lane)) #endif @@ -131,7 +131,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlsh_laneq_s32(a, b, v, lane) simde_vqrdmlsh_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlsh_laneq_s32 #define vqrdmlsh_laneq_s32(a, b, v, lane) simde_vqrdmlsh_laneq_s32((a), (b), (v), (lane)) #endif @@ -141,7 +141,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlshq_laneq_s16(a, b, v, lane) simde_vqrdmlshq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshq_laneq_s16 #define vqrdmlshq_laneq_s16(a, b, v, lane) simde_vqrdmlshq_laneq_s16((a), (b), (v), (lane)) #endif @@ -151,7 +151,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vqrdmlshq_laneq_s32(a, b, v, lane) simde_vqrdmlshq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane))) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_QRDMX))) #undef vqrdmlshq_laneq_s32 #define 
vqrdmlshq_laneq_s32(a, b, v, lane) simde_vqrdmlshq_laneq_s32((a), (b), (v), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/qrdmulh.h b/thirdparty/simde/arm/neon/qrdmulh.h index 55fedfe72..12e16a146 100644 --- a/thirdparty/simde/arm/neon/qrdmulh.h +++ b/thirdparty/simde/arm/neon/qrdmulh.h @@ -128,29 +128,8 @@ simde_vqrdmulhq_s16(simde_int16x8_t a, simde_int16x8_t b) { a_ = simde_int16x8_to_private(a), b_ = simde_int16x8_to_private(b); - /* https://github.com/WebAssembly/simd/pull/365 */ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vqrdmulhq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - __m128i y = _mm_mulhrs_epi16(a_.m128i, b_.m128i); - __m128i tmp = _mm_cmpeq_epi16(y, _mm_set1_epi16(INT16_MAX)); - r_.m128i = _mm_xor_si128(y, tmp); - #elif defined(SIMDE_X86_SSE2_NATIVE) - const __m128i prod_lo = _mm_mullo_epi16(a_.m128i, b_.m128i); - const __m128i prod_hi = _mm_mulhi_epi16(a_.m128i, b_.m128i); - const __m128i tmp = - _mm_add_epi16( - _mm_avg_epu16( - _mm_srli_epi16(prod_lo, 14), - _mm_setzero_si128() - ), - _mm_add_epi16(prod_hi, prod_hi) - ); - r_.m128i = - _mm_xor_si128( - tmp, - _mm_cmpeq_epi16(_mm_set1_epi16(INT16_MAX), tmp) - ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/thirdparty/simde/arm/neon/qrshl.h b/thirdparty/simde/arm/neon/qrshl.h index e91435619..74b8f47de 100644 --- a/thirdparty/simde/arm/neon/qrshl.h +++ b/thirdparty/simde/arm/neon/qrshl.h @@ -207,7 +207,7 @@ simde_vqrshlb_u8(uint8_t a, int8_t b) { r = (a >> -b) + ((a >> (-b - 1)) & 1); } else if (b == 0) { r = a; - } else if (b < 7) { + } else if (b < 8) { r = HEDLEY_STATIC_CAST(uint8_t, a << b); if ((r >> b) != a) { r = UINT8_MAX; @@ -250,7 +250,7 @@ simde_vqrshlh_u16(uint16_t a, int16_t b) { r = (a >> -b) + ((a >> (-b - 1)) & 1); } else if (b == 0) { r = a; - } else if (b < 15) { + } else if (b < 16) { r = HEDLEY_STATIC_CAST(uint16_t, a << b); if ((r >> b) != a) { r = UINT16_MAX; @@ 
-290,10 +290,13 @@ simde_vqrshls_u32(uint32_t a, int32_t b) { if (b < -32) { r = 0; } else if (b < 0) { - r = (a >> -b) + ((a >> (-b - 1)) & 1); + if (b == -32) + r = (a >> 31) & 1; + else + r = (a >> -b) + ((a >> (-b - 1)) & 1); } else if (b == 0) { r = a; - } else if (b < 31) { + } else if (b < 32) { r = HEDLEY_STATIC_CAST(uint32_t, a << b); if ((r >> b) != a) { r = UINT32_MAX; @@ -333,10 +336,13 @@ simde_vqrshld_u64(uint64_t a, int64_t b) { if (b < -64) { r = 0; } else if (b < 0) { - r = (a >> -b) + ((a >> (-b - 1)) & 1); + if (b == -64) + r = (a >> 63) & 1; + else + r = (a >> -b) + ((a >> (-b - 1)) & 1); } else if (b == 0) { r = a; - } else if (b < 63) { + } else if (b < 64) { r = HEDLEY_STATIC_CAST(uint64_t, a << b); if ((r >> b) != a) { r = UINT64_MAX; diff --git a/thirdparty/simde/arm/neon/qrshrun_high_n.h b/thirdparty/simde/arm/neon/qrshrun_high_n.h index b035681c3..a06abe776 100644 --- a/thirdparty/simde/arm/neon/qrshrun_high_n.h +++ b/thirdparty/simde/arm/neon/qrshrun_high_n.h @@ -54,7 +54,7 @@ simde_vqrshrun_high_n_s16(simde_uint8x8_t r, simde_int16x8_t a, const int n) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__) #define simde_vqrshrun_high_n_s16(r, a, n) vqrshrun_high_n_s16((r), (a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(__clang__)) #undef vqrshrun_high_n_s16 #define vqrshrun_high_n_s16(r, a, n) simde_vqrshrun_high_n_s16((r), (a), (n)) #endif @@ -78,7 +78,7 @@ simde_vqrshrun_high_n_s32(simde_uint16x4_t r, simde_int32x4_t a, const int n) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__) #define simde_vqrshrun_high_n_s32(r, a, n) vqrshrun_high_n_s32((r), (a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(__clang__)) #undef vqrshrun_high_n_s32 #define 
vqrshrun_high_n_s32(r, a, n) simde_vqrshrun_high_n_s32((r), (a), (n)) #endif @@ -102,7 +102,7 @@ simde_vqrshrun_high_n_s64(simde_uint32x2_t r, simde_int64x2_t a, const int n) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__) #define simde_vqrshrun_high_n_s64(r, a, n) vqrshrun_high_n_s64((r), (a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(__clang__)) #undef vqrshrun_high_n_s64 #define vqrshrun_high_n_s64(r, a, n) simde_vqrshrun_high_n_s64((r), (a), (n)) #endif diff --git a/thirdparty/simde/arm/neon/qshrun_high_n.h b/thirdparty/simde/arm/neon/qshrun_high_n.h index c30368600..acea87463 100644 --- a/thirdparty/simde/arm/neon/qshrun_high_n.h +++ b/thirdparty/simde/arm/neon/qshrun_high_n.h @@ -54,7 +54,7 @@ simde_vqshrun_high_n_s16(simde_uint8x8_t r, simde_int16x8_t a, const int n) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71365) #define simde_vqshrun_high_n_s16(r, a, n) vqshrun_high_n_s16((r), (a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(SIMDE_BUG_CLANG_71365)) #undef vqshrun_high_n_s16 #define vqshrun_high_n_s16(r, a, n) simde_vqshrun_high_n_s16((r), (a), (n)) #endif @@ -78,7 +78,7 @@ simde_vqshrun_high_n_s32(simde_uint16x4_t r, simde_int32x4_t a, const int n) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71365) #define simde_vqshrun_high_n_s32(r, a, n) vqshrun_high_n_s32((r), (a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(SIMDE_BUG_CLANG_71365)) #undef vqshrun_high_n_s32 #define vqshrun_high_n_s32(r, a, n) simde_vqshrun_high_n_s32((r), (a), (n)) #endif @@ -102,7 +102,7 @@ 
simde_vqshrun_high_n_s64(simde_uint32x2_t r, simde_int64x2_t a, const int n) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71365) #define simde_vqshrun_high_n_s64(r, a, n) vqshrun_high_n_s64((r), (a), (n)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && defined(SIMDE_BUG_CLANG_71365)) #undef vqshrun_high_n_s64 #define vqshrun_high_n_s64(r, a, n) simde_vqshrun_high_n_s64((r), (a), (n)) #endif diff --git a/thirdparty/simde/arm/neon/rax.h b/thirdparty/simde/arm/neon/rax.h index 052e9caf6..a83576a48 100644 --- a/thirdparty/simde/arm/neon/rax.h +++ b/thirdparty/simde/arm/neon/rax.h @@ -53,7 +53,7 @@ simde_vrax1q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { return simde_uint64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vrax1q_u64 #define vrax1q_u64(a, b) simde_vrax1q_u64((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/recpe.h b/thirdparty/simde/arm/neon/recpe.h index be068a06c..b7a8b11f8 100644 --- a/thirdparty/simde/arm/neon/recpe.h +++ b/thirdparty/simde/arm/neon/recpe.h @@ -48,7 +48,7 @@ simde_vrecpeh_f16(simde_float16_t a) { return simde_float16_from_float32(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrecpeh_f16 #define vrecpeh_f16(a) simde_vrecpeh_f16((a)) #endif @@ -103,7 +103,7 @@ simde_vrecpe_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrecpe_f16 
#define vrecpe_f16(a) simde_vrecpe_f16((a)) #endif @@ -275,7 +275,7 @@ simde_vrecpeq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrecpeq_f16 #define vrecpeq_f16(a) simde_vrecpeq_f16((a)) #endif diff --git a/thirdparty/simde/arm/neon/recps.h b/thirdparty/simde/arm/neon/recps.h index 9d1f7ecc9..d0f06fcd4 100644 --- a/thirdparty/simde/arm/neon/recps.h +++ b/thirdparty/simde/arm/neon/recps.h @@ -46,7 +46,7 @@ simde_vrecpsh_f16(simde_float16_t a, simde_float16_t b) { simde_float16_to_float32(a) * simde_float16_to_float32(b)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrecpsh_f16 #define vrecpsh_f16(a, b) simde_vrecpsh_f16((a), (b)) #endif @@ -112,7 +112,7 @@ simde_vrecps_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrecps_f16 #define vrecps_f16(a, b) simde_vrecps_f16((a), (b)) #endif @@ -178,7 +178,7 @@ simde_vrecpsq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrecpsq_f16 #define vrecpsq_f16(a, b) simde_vrecpsq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/recpx.h b/thirdparty/simde/arm/neon/recpx.h index c1a36f650..fede73dd9 100644 --- a/thirdparty/simde/arm/neon/recpx.h +++ 
b/thirdparty/simde/arm/neon/recpx.h @@ -61,7 +61,7 @@ simde_vrecpxh_f16(simde_float16_t a) { return a; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrecpxh_f16 #define vrecpxh_f16(a) simde_vrecpxh_f16((a)) #endif diff --git a/thirdparty/simde/arm/neon/reinterpret.h b/thirdparty/simde/arm/neon/reinterpret.h index 3af62f773..4a9f9c33c 100644 --- a/thirdparty/simde/arm/neon/reinterpret.h +++ b/thirdparty/simde/arm/neon/reinterpret.h @@ -1697,7 +1697,8 @@ simde_vreinterpret_u16_f16(simde_float16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_u16_f16 #define vreinterpret_u16_f16(a) simde_vreinterpret_u16_f16(a) #endif @@ -2173,7 +2174,8 @@ simde_vreinterpretq_u16_f16(simde_float16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_u16_f16 #define vreinterpretq_u16_f16(a) simde_vreinterpretq_u16_f16(a) #endif @@ -2343,7 +2345,8 @@ simde_vreinterpret_u64_f16(simde_float16x4_t a) { return simde_uint64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_u64_f16 #define vreinterpret_u64_f16 simde_vreinterpret_u64_f16 #endif @@ -2649,7 +2652,8 @@ simde_vreinterpret_f16_u16(simde_uint16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_u16 #define vreinterpret_f16_u16(a) simde_vreinterpret_f16_u16(a) #endif @@ -2820,7 +2824,8 @@ simde_vreinterpretq_f16_u16(simde_uint16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_u16 #define vreinterpretq_f16_u16(a) simde_vreinterpretq_f16_u16(a) #endif @@ -3194,7 +3199,8 @@ simde_vreinterpret_f16_f32(simde_float32x2_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_f32 #define vreinterpret_f16_f32 simde_vreinterpret_f16_f32 #endif @@ -3211,7 +3217,8 @@ simde_vreinterpret_f16_s16(simde_int16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_s16 #define vreinterpret_f16_s16 simde_vreinterpret_f16_s16 #endif @@ -3228,7 +3235,8 @@ simde_vreinterpret_f16_s32(simde_int32x2_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_s32 #define vreinterpret_f16_s32 simde_vreinterpret_f16_s32 #endif @@ -3245,7 +3253,8 @@ simde_vreinterpret_f16_s64(simde_int64x1_t a) { 
return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_s64 #define vreinterpret_f16_s64 simde_vreinterpret_f16_s64 #endif @@ -3262,7 +3271,8 @@ simde_vreinterpret_f16_s8(simde_int8x8_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_s8 #define vreinterpret_f16_s8 simde_vreinterpret_f16_s8 #endif @@ -3279,7 +3289,8 @@ simde_vreinterpret_f16_u32(simde_uint32x2_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_u32 #define vreinterpret_f16_u32 simde_vreinterpret_f16_u32 #endif @@ -3296,7 +3307,8 @@ simde_vreinterpret_f16_u64(simde_uint64x1_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_u64 #define vreinterpret_f16_u64 simde_vreinterpret_f16_u64 #endif @@ -3313,7 +3325,8 @@ simde_vreinterpret_f16_u8(simde_uint8x8_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_u8 #define vreinterpret_f16_u8 simde_vreinterpret_f16_u8 #endif @@ -3330,7 +3343,8 @@ 
simde_vreinterpretq_f16_f32(simde_float32x4_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_f32 #define vreinterpretq_f16_f32(a) simde_vreinterpretq_f16_f32(a) #endif @@ -3347,7 +3361,8 @@ simde_vreinterpretq_f16_s16(simde_int16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_s16 #define vreinterpretq_f16_s16(a) simde_vreinterpretq_f16_s16(a) #endif @@ -3364,7 +3379,8 @@ simde_vreinterpretq_f16_s32(simde_int32x4_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_s32 #define vreinterpretq_f16_s32(a) simde_vreinterpretq_f16_s32(a) #endif @@ -3381,7 +3397,8 @@ simde_vreinterpretq_f16_s64(simde_int64x2_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_s64 #define vreinterpretq_f16_s64(a) simde_vreinterpretq_f16_s64(a) #endif @@ -3398,7 +3415,8 @@ simde_vreinterpretq_f16_s8(simde_int8x16_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_s8 #define 
vreinterpretq_f16_s8(a) simde_vreinterpretq_f16_s8(a) #endif @@ -3415,7 +3433,8 @@ simde_vreinterpretq_f16_u32(simde_uint32x4_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_u32 #define vreinterpretq_f16_u32(a) simde_vreinterpretq_f16_u32(a) #endif @@ -3432,7 +3451,8 @@ simde_vreinterpretq_f16_u64(simde_uint64x2_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_u64 #define vreinterpretq_f16_u64(a) simde_vreinterpretq_f16_u64(a) #endif @@ -3449,7 +3469,8 @@ simde_vreinterpretq_f16_u8(simde_uint8x16_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_u8 #define vreinterpretq_f16_u8(a) simde_vreinterpretq_f16_u8(a) #endif @@ -3466,7 +3487,8 @@ simde_vreinterpret_f16_f64(simde_float64x1_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_f64 #define vreinterpret_f16_f64 simde_vreinterpret_f16_f64 #endif @@ -3483,7 +3505,8 @@ simde_vreinterpretq_f16_f64(simde_float64x2_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ 
+ !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_f64 #define vreinterpretq_f16_f64(a) simde_vreinterpretq_f16_f64(a) #endif @@ -3500,7 +3523,8 @@ simde_vreinterpret_f32_f16(simde_float16x4_t a) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f32_f16 #define vreinterpret_f32_f16 simde_vreinterpret_f32_f16 #endif @@ -3517,7 +3541,8 @@ simde_vreinterpretq_f32_f16(simde_float16x8_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f32_f16 #define vreinterpretq_f32_f16 simde_vreinterpretq_f32_f16 #endif @@ -3534,7 +3559,8 @@ simde_vreinterpret_f64_f16(simde_float16x4_t a) { return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f64_f16 #define vreinterpret_f64_f16 simde_vreinterpret_f64_f16 #endif @@ -3551,7 +3577,8 @@ simde_vreinterpretq_f64_f16(simde_float16x8_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f64_f16 #define vreinterpretq_f64_f16 simde_vreinterpretq_f64_f16 #endif @@ -3568,7 +3595,8 @@ simde_vreinterpret_u8_f16(simde_float16x4_t a) { return simde_uint8x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_u8_f16 #define vreinterpret_u8_f16(a) simde_vreinterpret_u8_f16(a) #endif @@ -3585,7 +3613,8 @@ simde_vreinterpretq_u8_f16(simde_float16x8_t a) { return simde_uint8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_u8_f16 #define vreinterpretq_u8_f16(a) simde_vreinterpretq_u8_f16(a) #endif @@ -3602,7 +3631,8 @@ simde_vreinterpret_s8_f16(simde_float16x4_t a) { return simde_int8x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_s8_f16 #define vreinterpret_s8_f16(a) simde_vreinterpret_s8_f16(a) #endif @@ -3619,7 +3649,8 @@ simde_vreinterpretq_s8_f16(simde_float16x8_t a) { return simde_int8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_s8_f16 #define vreinterpretq_s8_f16(a) simde_vreinterpretq_s8_f16(a) #endif @@ -3636,7 +3667,8 @@ simde_vreinterpret_s16_f16(simde_float16x4_t a) { return simde_int16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_s16_f16 #define vreinterpret_s16_f16(a) simde_vreinterpret_s16_f16(a) #endif @@ -3653,7 +3685,8 @@ simde_vreinterpretq_s16_f16(simde_float16x8_t a) { return simde_int16x8_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_s16_f16 #define vreinterpretq_s16_f16(a) simde_vreinterpretq_s16_f16(a) #endif @@ -3670,7 +3703,8 @@ simde_vreinterpret_s32_f16(simde_float16x4_t a) { return simde_int32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_s32_f16 #define vreinterpret_s32_f16(a) simde_vreinterpret_s32_f16(a) #endif @@ -3687,7 +3721,8 @@ simde_vreinterpretq_s32_f16(simde_float16x8_t a) { return simde_int32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_s32_f16 #define vreinterpretq_s32_f16(a) simde_vreinterpretq_s32_f16(a) #endif @@ -3704,7 +3739,8 @@ simde_vreinterpret_s64_f16(simde_float16x4_t a) { return simde_int64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_s64_f16 #define vreinterpret_s64_f16(a) simde_vreinterpret_s64_f16(a) #endif @@ -3721,7 +3757,8 @@ simde_vreinterpretq_s64_f16(simde_float16x8_t a) { return simde_int64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_s64_f16 #define vreinterpretq_s64_f16(a) simde_vreinterpretq_s64_f16(a) #endif @@ -3738,7 +3775,8 @@ 
simde_vreinterpret_u32_f16(simde_float16x4_t a) { return simde_uint32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_u32_f16 #define vreinterpret_u32_f16(a) simde_vreinterpret_u32_f16(a) #endif @@ -3755,7 +3793,8 @@ simde_vreinterpretq_u32_f16(simde_float16x8_t a) { return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_u32_f16 #define vreinterpretq_u32_f16(a) simde_vreinterpretq_u32_f16(a) #endif @@ -3772,7 +3811,8 @@ simde_vreinterpretq_u64_f16(simde_float16x8_t a) { return simde_uint64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_u64_f16 #define vreinterpretq_u64_f16 simde_vreinterpretq_u64_f16 #endif @@ -4163,7 +4203,8 @@ simde_vreinterpret_p16_f16(simde_float16x4_t a) { return simde_poly16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_p16_f16 #define vreinterpret_p16_f16(a) simde_vreinterpret_p16_f16(a) #endif @@ -4350,7 +4391,8 @@ simde_vreinterpretq_p16_f16(simde_float16x8_t a) { return simde_poly16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_p16_f16 #define 
vreinterpretq_p16_f16(a) simde_vreinterpretq_p16_f16(a) #endif @@ -4452,7 +4494,8 @@ simde_vreinterpret_p64_f16(simde_float16x4_t a) { return simde_poly64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_p64_f16 #define vreinterpret_p64_f16 simde_vreinterpret_p64_f16 #endif @@ -4639,7 +4682,8 @@ simde_vreinterpret_p8_f16(simde_float16x4_t a) { return simde_poly8x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_p8_f16 #define vreinterpret_p8_f16(a) simde_vreinterpret_p8_f16(a) #endif @@ -4656,7 +4700,8 @@ simde_vreinterpretq_p8_f16(simde_float16x8_t a) { return simde_poly8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_p8_f16 #define vreinterpretq_p8_f16(a) simde_vreinterpretq_p8_f16(a) #endif @@ -4673,7 +4718,8 @@ simde_vreinterpretq_p64_f16(simde_float16x8_t a) { return simde_poly64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_p64_f16 #define vreinterpretq_p64_f16 simde_vreinterpretq_p64_f16 #endif @@ -5132,7 +5178,8 @@ simde_vreinterpret_f16_p16(simde_poly16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + 
!(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_p16 #define vreinterpret_f16_p16(a) simde_vreinterpret_f16_p16(a) #endif @@ -5183,7 +5230,8 @@ simde_vreinterpretq_f16_p16(simde_poly16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_p16 #define vreinterpretq_f16_p16(a) simde_vreinterpretq_f16_p16(a) #endif @@ -5302,7 +5350,8 @@ simde_vreinterpret_f16_p64(simde_poly64x1_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_p64 #define vreinterpret_f16_p64 simde_vreinterpret_f16_p64 #endif @@ -5319,7 +5368,8 @@ simde_vreinterpret_f16_p8(simde_poly8x8_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpret_f16_p8 #define vreinterpret_f16_p8 simde_vreinterpret_f16_p8 #endif @@ -5336,7 +5386,8 @@ simde_vreinterpretq_f16_p64(simde_poly64x2_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_p64 #define vreinterpretq_f16_p64(a) simde_vreinterpretq_f16_p64(a) #endif @@ -5353,7 +5404,8 @@ simde_vreinterpretq_f16_p8(simde_poly8x16_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_p8 #define vreinterpretq_f16_p8(a) simde_vreinterpretq_f16_p8(a) #endif @@ -6178,7 +6230,7 @@ simde_vreinterpretq_p64_u64(simde_uint64x2_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_s8(simde_int8x16_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_s8(a); #else simde_poly128_t r_; @@ -6195,7 +6247,7 @@ simde_vreinterpretq_p128_s8(simde_int8x16_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_s16(simde_int16x8_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_s16(a); #else simde_poly128_t r_; @@ -6212,7 +6264,7 @@ simde_vreinterpretq_p128_s16(simde_int16x8_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_s32(simde_int32x4_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_s32(a); #else simde_poly128_t r_; @@ -6229,7 +6281,7 @@ simde_vreinterpretq_p128_s32(simde_int32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_s64(simde_int64x2_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_s64(a); #else simde_poly128_t r_; @@ -6246,7 +6298,7 @@ simde_vreinterpretq_p128_s64(simde_int64x2_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_u8(simde_uint8x16_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_u8(a); #else simde_poly128_t r_; @@ -6263,7 +6315,7 @@ 
simde_vreinterpretq_p128_u8(simde_uint8x16_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_u16(simde_uint16x8_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_u16(a); #else simde_poly128_t r_; @@ -6280,7 +6332,7 @@ simde_vreinterpretq_p128_u16(simde_uint16x8_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_u32(simde_uint32x4_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_u32(a); #else simde_poly128_t r_; @@ -6297,7 +6349,7 @@ simde_vreinterpretq_p128_u32(simde_uint32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_u64(simde_uint64x2_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_u64(a); #else simde_poly128_t r_; @@ -6314,7 +6366,7 @@ simde_vreinterpretq_p128_u64(simde_uint64x2_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_p8(simde_poly8x16_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_p8(a); #else simde_poly128_t r_; @@ -6331,7 +6383,7 @@ simde_vreinterpretq_p128_p8(simde_poly8x16_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_p16(simde_poly16x8_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_p16(a); #else simde_poly128_t r_; @@ -6348,7 +6400,7 @@ simde_vreinterpretq_p128_p16(simde_poly16x8_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_f16(simde_float16x8_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vreinterpretq_p128_f16(a); #else simde_poly128_t r_; @@ -6357,7 +6409,8 @@ simde_vreinterpretq_p128_f16(simde_float16x8_t a) { return r_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_p128_f16 #define vreinterpretq_p128_f16(a) simde_vreinterpretq_p128_f16(a) #endif @@ -6365,7 +6418,7 @@ simde_vreinterpretq_p128_f16(simde_float16x8_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_f32(simde_float32x4_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p128_f32(a); #else simde_poly128_t r_; @@ -6382,7 +6435,7 @@ simde_vreinterpretq_p128_f32(simde_float32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly128_t simde_vreinterpretq_p128_f64(simde_float64x2_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vreinterpretq_p128_f64(a); #else simde_poly128_t r_; @@ -6391,7 +6444,7 @@ simde_vreinterpretq_p128_f64(simde_float64x2_t a) { return r_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vreinterpretq_p128_f64 #define vreinterpretq_p128_f64(a) simde_vreinterpretq_p128_f64(a) #endif @@ -6399,7 +6452,7 @@ simde_vreinterpretq_p128_f64(simde_float64x2_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_int8x16_t simde_vreinterpretq_s8_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_s8_p128(a); #else simde_int8x16_private r_; @@ -6416,7 +6469,7 @@ simde_vreinterpretq_s8_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_int16x8_t 
simde_vreinterpretq_s16_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_s16_p128(a); #else simde_int16x8_private r_; @@ -6433,7 +6486,7 @@ simde_vreinterpretq_s16_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_int32x4_t simde_vreinterpretq_s32_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_s32_p128(a); #else simde_int32x4_private r_; @@ -6450,7 +6503,7 @@ simde_vreinterpretq_s32_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_int64x2_t simde_vreinterpretq_s64_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_s64_p128(a); #else simde_int64x2_private r_; @@ -6467,7 +6520,7 @@ simde_vreinterpretq_s64_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_uint8x16_t simde_vreinterpretq_u8_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_u8_p128(a); #else simde_uint8x16_private r_; @@ -6484,7 +6537,7 @@ simde_vreinterpretq_u8_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_uint16x8_t simde_vreinterpretq_u16_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_u16_p128(a); #else simde_uint16x8_private r_; @@ -6501,7 +6554,7 @@ simde_vreinterpretq_u16_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vreinterpretq_u32_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_u32_p128(a); #else simde_uint32x4_private r_; @@ -6518,7 
+6571,7 @@ simde_vreinterpretq_u32_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2_t simde_vreinterpretq_u64_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_u64_p128(a); #else simde_uint64x2_private r_; @@ -6535,7 +6588,7 @@ simde_vreinterpretq_u64_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly8x16_t simde_vreinterpretq_p8_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p8_p128(a); #else simde_poly8x16_private r_; @@ -6552,7 +6605,7 @@ simde_vreinterpretq_p8_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_poly16x8_t simde_vreinterpretq_p16_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) return vreinterpretq_p16_p128(a); #else simde_poly16x8_private r_; @@ -6569,7 +6622,7 @@ simde_vreinterpretq_p16_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t simde_vreinterpretq_f16_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vreinterpretq_f16_p128(a); #else simde_float16x8_private r_; @@ -6578,7 +6631,8 @@ simde_vreinterpretq_f16_p128(simde_poly128_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vreinterpretq_f16_p128 #define vreinterpretq_f16_p128(a) simde_vreinterpretq_f16_p128(a) #endif @@ -6586,7 +6640,7 @@ simde_vreinterpretq_f16_p128(simde_poly128_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t 
simde_vreinterpretq_f64_p128(simde_poly128_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vreinterpretq_f64_p128(a); #else simde_float64x2_private r_; @@ -6595,7 +6649,7 @@ simde_vreinterpretq_f64_p128(simde_poly128_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vreinterpretq_f64_p128 #define vreinterpretq_f64_p128(a) simde_vreinterpretq_f64_p128(a) #endif @@ -6614,7 +6668,8 @@ simde_vreinterpret_bf16_s8(simde_int8x8_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_s8 #define vreinterpret_bf16_s8(a) simde_vreinterpret_bf16_s8(a) #endif @@ -6631,7 +6686,8 @@ simde_vreinterpret_bf16_s16(simde_int16x4_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_s16 #define vreinterpret_bf16_s16(a) simde_vreinterpret_bf16_s16(a) #endif @@ -6648,7 +6704,8 @@ simde_vreinterpret_bf16_s32(simde_int32x2_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_s32 #define vreinterpret_bf16_s32(a) simde_vreinterpret_bf16_s32(a) #endif @@ -6665,7 +6722,8 @@ simde_vreinterpret_bf16_s64(simde_int64x1_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_s64 #define vreinterpret_bf16_s64(a) simde_vreinterpret_bf16_s64(a) #endif @@ -6682,7 +6740,8 @@ simde_vreinterpret_bf16_u8(simde_uint8x8_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_u8 #define vreinterpret_bf16_u8(a) simde_vreinterpret_bf16_u8(a) #endif @@ -6699,7 +6758,8 @@ simde_vreinterpret_bf16_u16(simde_uint16x4_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_u16 #define vreinterpret_bf16_u16(a) simde_vreinterpret_bf16_u16(a) #endif @@ -6716,7 +6776,8 @@ simde_vreinterpret_bf16_u32(simde_uint32x2_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_u32 #define vreinterpret_bf16_u32(a) simde_vreinterpret_bf16_u32(a) #endif @@ -6733,7 +6794,8 @@ simde_vreinterpret_bf16_u64(simde_uint64x1_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_u64 #define vreinterpret_bf16_u64(a) simde_vreinterpret_bf16_u64(a) #endif @@ -6750,7 +6812,8 @@ 
simde_vreinterpret_bf16_f32(simde_float32x2_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_f32 #define vreinterpret_bf16_f32 simde_vreinterpret_bf16_f32 #endif @@ -6767,7 +6830,8 @@ simde_vreinterpret_bf16_f64(simde_float64x1_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_f64 #define vreinterpret_bf16_f64 simde_vreinterpret_bf16_f64 #endif @@ -6784,7 +6848,8 @@ simde_vreinterpretq_bf16_s8(simde_int8x16_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_s8 #define vreinterpretq_bf16_s8(a) simde_vreinterpretq_bf16_s8(a) #endif @@ -6801,7 +6866,8 @@ simde_vreinterpretq_bf16_s16(simde_int16x8_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_s16 #define vreinterpretq_bf16_s16(a) simde_vreinterpretq_bf16_s16(a) #endif @@ -6818,7 +6884,8 @@ simde_vreinterpretq_bf16_s32(simde_int32x4_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_s32 #define 
vreinterpretq_bf16_s32(a) simde_vreinterpretq_bf16_s32(a) #endif @@ -6835,7 +6902,8 @@ simde_vreinterpretq_bf16_s64(simde_int64x2_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_s64 #define vreinterpretq_bf16_s64(a) simde_vreinterpretq_bf16_s64(a) #endif @@ -6852,7 +6920,8 @@ simde_vreinterpretq_bf16_u8(simde_uint8x16_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_u8 #define vreinterpretq_bf16_u8(a) simde_vreinterpretq_bf16_u8(a) #endif @@ -6869,7 +6938,8 @@ simde_vreinterpretq_bf16_u16(simde_uint16x8_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_u16 #define vreinterpretq_bf16_u16(a) simde_vreinterpretq_bf16_u16(a) #endif @@ -6886,7 +6956,8 @@ simde_vreinterpretq_bf16_u32(simde_uint32x4_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_u32 #define vreinterpretq_bf16_u32(a) simde_vreinterpretq_bf16_u32(a) #endif @@ -6903,7 +6974,8 @@ simde_vreinterpretq_bf16_u64(simde_uint64x2_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_u64 #define vreinterpretq_bf16_u64(a) simde_vreinterpretq_bf16_u64(a) #endif @@ -6920,7 +6992,8 @@ simde_vreinterpretq_bf16_f32(simde_float32x4_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_f32 #define vreinterpretq_bf16_f32 simde_vreinterpretq_bf16_f32 #endif @@ -6937,7 +7010,8 @@ simde_vreinterpretq_bf16_f64(simde_float64x2_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_f64 #define vreinterpretq_bf16_f64 simde_vreinterpretq_bf16_f64 #endif @@ -6954,7 +7028,8 @@ simde_vreinterpret_s8_bf16(simde_bfloat16x4_t a) { return simde_int8x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_s8_bf16 #define vreinterpret_s8_bf16(a) simde_vreinterpret_s8_bf16(a) #endif @@ -6971,7 +7046,8 @@ simde_vreinterpret_s16_bf16(simde_bfloat16x4_t a) { return simde_int16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_s16_bf16 #define vreinterpret_s16_bf16(a) simde_vreinterpret_s16_bf16(a) #endif @@ -6988,7 +7064,8 @@ simde_vreinterpret_s32_bf16(simde_bfloat16x4_t a) { return simde_int32x2_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_s32_bf16 #define vreinterpret_s32_bf16(a) simde_vreinterpret_s32_bf16(a) #endif @@ -7005,7 +7082,8 @@ simde_vreinterpret_s64_bf16(simde_bfloat16x4_t a) { return simde_int64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_s64_bf16 #define vreinterpret_s64_bf16(a) simde_vreinterpret_s64_bf16(a) #endif @@ -7022,7 +7100,8 @@ simde_vreinterpret_u8_bf16(simde_bfloat16x4_t a) { return simde_uint8x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_u8_bf16 #define vreinterpret_u8_bf16(a) simde_vreinterpret_u8_bf16(a) #endif @@ -7039,7 +7118,8 @@ simde_vreinterpret_u16_bf16(simde_bfloat16x4_t a) { return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_u16_bf16 #define vreinterpret_u16_bf16(a) simde_vreinterpret_u16_bf16(a) #endif @@ -7056,7 +7136,8 @@ simde_vreinterpret_u32_bf16(simde_bfloat16x4_t a) { return simde_uint32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_u32_bf16 #define vreinterpret_u32_bf16(a) simde_vreinterpret_u32_bf16(a) #endif @@ -7073,7 +7154,8 @@ 
simde_vreinterpret_u64_bf16(simde_bfloat16x4_t a) { return simde_uint64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_u64_bf16 #define vreinterpret_u64_bf16(a) simde_vreinterpret_u64_bf16(a) #endif @@ -7090,7 +7172,8 @@ simde_vreinterpret_f32_bf16(simde_bfloat16x4_t a) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_f32_bf16 #define vreinterpret_f32_bf16 simde_vreinterpret_f32_bf16 #endif @@ -7107,7 +7190,8 @@ simde_vreinterpret_f64_bf16(simde_bfloat16x4_t a) { return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_f64_bf16 #define vreinterpret_f64_bf16 simde_vreinterpret_f64_bf16 #endif @@ -7124,7 +7208,8 @@ simde_vreinterpretq_s8_bf16(simde_bfloat16x8_t a) { return simde_int8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_s8_bf16 #define vreinterpretq_s8_bf16(a) simde_vreinterpretq_s8_bf16(a) #endif @@ -7141,7 +7226,8 @@ simde_vreinterpretq_s16_bf16(simde_bfloat16x8_t a) { return simde_int16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_s16_bf16 #define 
vreinterpretq_s16_bf16(a) simde_vreinterpretq_s16_bf16(a) #endif @@ -7158,7 +7244,8 @@ simde_vreinterpretq_s32_bf16(simde_bfloat16x8_t a) { return simde_int32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_s32_bf16 #define vreinterpretq_s32_bf16(a) simde_vreinterpretq_s32_bf16(a) #endif @@ -7175,7 +7262,8 @@ simde_vreinterpretq_s64_bf16(simde_bfloat16x8_t a) { return simde_int64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_s64_bf16 #define vreinterpretq_s64_bf16(a) simde_vreinterpretq_s64_bf16(a) #endif @@ -7192,7 +7280,8 @@ simde_vreinterpretq_u8_bf16(simde_bfloat16x8_t a) { return simde_uint8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_u8_bf16 #define vreinterpretq_u8_bf16(a) simde_vreinterpretq_u8_bf16(a) #endif @@ -7209,7 +7298,8 @@ simde_vreinterpretq_u16_bf16(simde_bfloat16x8_t a) { return simde_uint16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_u16_bf16 #define vreinterpretq_u16_bf16(a) simde_vreinterpretq_u16_bf16(a) #endif @@ -7226,7 +7316,8 @@ simde_vreinterpretq_u32_bf16(simde_bfloat16x8_t a) { return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_u32_bf16 #define vreinterpretq_u32_bf16(a) simde_vreinterpretq_u32_bf16(a) #endif @@ -7243,7 +7334,8 @@ simde_vreinterpretq_u64_bf16(simde_bfloat16x8_t a) { return simde_uint64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_u64_bf16 #define vreinterpretq_u64_bf16(a) simde_vreinterpretq_u64_bf16(a) #endif @@ -7260,7 +7352,8 @@ simde_vreinterpretq_f32_bf16(simde_bfloat16x8_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_f32_bf16 #define vreinterpretq_f32_bf16 simde_vreinterpretq_f32_bf16 #endif @@ -7277,7 +7370,8 @@ simde_vreinterpretq_f64_bf16(simde_bfloat16x8_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_f64_bf16 #define vreinterpretq_f64_bf16 simde_vreinterpretq_f64_bf16 #endif @@ -7294,7 +7388,8 @@ simde_vreinterpret_bf16_p8(simde_poly8x8_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_p8 #define vreinterpret_bf16_p8(a) simde_vreinterpret_bf16_p8(a) #endif @@ -7311,7 +7406,8 @@ simde_vreinterpret_bf16_p16(simde_poly16x4_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_p16 #define vreinterpret_bf16_p16(a) simde_vreinterpret_bf16_p16(a) #endif @@ -7328,7 +7424,8 @@ simde_vreinterpret_bf16_p64(simde_poly64x1_t a) { return simde_bfloat16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_bf16_p64 #define vreinterpret_bf16_p64(a) simde_vreinterpret_bf16_p64(a) #endif @@ -7345,7 +7442,8 @@ simde_vreinterpretq_bf16_p8(simde_poly8x16_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_p8 #define vreinterpretq_bf16_p8(a) simde_vreinterpretq_bf16_p8(a) #endif @@ -7362,7 +7460,8 @@ simde_vreinterpretq_bf16_p16(simde_poly16x8_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_p16 #define vreinterpretq_bf16_p16(a) simde_vreinterpretq_bf16_p16(a) #endif @@ -7379,7 +7478,8 @@ simde_vreinterpretq_bf16_p64(simde_poly64x2_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_p64 #define vreinterpretq_bf16_p64(a) simde_vreinterpretq_bf16_p64(a) #endif @@ -7396,7 +7496,8 @@ 
simde_vreinterpret_p8_bf16(simde_bfloat16x4_t a) { return simde_poly8x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_p8_bf16 #define vreinterpret_p8_bf16(a) simde_vreinterpret_p8_bf16(a) #endif @@ -7413,7 +7514,8 @@ simde_vreinterpret_p16_bf16(simde_bfloat16x4_t a) { return simde_poly16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_p16_bf16 #define vreinterpret_p16_bf16(a) simde_vreinterpret_p16_bf16(a) #endif @@ -7430,7 +7532,8 @@ simde_vreinterpret_p64_bf16(simde_bfloat16x4_t a) { return simde_poly64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpret_p64_bf16 #define vreinterpret_p64_bf16(a) simde_vreinterpret_p64_bf16(a) #endif @@ -7447,7 +7550,8 @@ simde_vreinterpretq_p8_bf16(simde_bfloat16x8_t a) { return simde_poly8x16_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_p8_bf16 #define vreinterpretq_p8_bf16(a) simde_vreinterpretq_p8_bf16(a) #endif @@ -7464,7 +7568,8 @@ simde_vreinterpretq_p16_bf16(simde_bfloat16x8_t a) { return simde_poly16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_p16_bf16 
#define vreinterpretq_p16_bf16(a) simde_vreinterpretq_p16_bf16(a) #endif @@ -7481,7 +7586,8 @@ simde_vreinterpretq_p64_bf16(simde_bfloat16x8_t a) { return simde_poly64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_p64_bf16 #define vreinterpretq_p64_bf16(a) simde_vreinterpretq_p64_bf16(a) #endif @@ -7500,7 +7606,8 @@ simde_vreinterpretq_p128_bf16(simde_bfloat16x8_t a) { return r_; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_p128_bf16 #define vreinterpretq_p128_bf16(a) simde_vreinterpretq_p128_bf16(a) #endif @@ -7517,7 +7624,8 @@ simde_vreinterpretq_bf16_p128(simde_poly128_t a) { return simde_bfloat16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_BF16))) #undef vreinterpretq_bf16_p128 #define vreinterpretq_bf16_p128(a) simde_vreinterpretq_bf16_p128(a) #endif diff --git a/thirdparty/simde/arm/neon/rev64.h b/thirdparty/simde/arm/neon/rev64.h index 565fd5902..e6c85bb52 100644 --- a/thirdparty/simde/arm/neon/rev64.h +++ b/thirdparty/simde/arm/neon/rev64.h @@ -184,7 +184,7 @@ simde_vrev64_f16(simde_float16x4_t a) { return simde_vreinterpret_f16_s16(simde_vrev64_s16(simde_vreinterpret_s16_f16(a))); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrev64_f16 #define vrev64_f16(a) simde_vrev64_f16(a) #endif @@ -374,7 +374,7 @@ simde_vrev64q_f16(simde_float16x8_t a) { return 
simde_vreinterpretq_f16_s16(simde_vrev64q_s16(simde_vreinterpretq_s16_f16(a))); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrev64q_f16 #define vrev64q_f16(a) simde_vrev64q_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/rnd.h b/thirdparty/simde/arm/neon/rnd.h index c663cdd90..64ea7ec19 100644 --- a/thirdparty/simde/arm/neon/rnd.h +++ b/thirdparty/simde/arm/neon/rnd.h @@ -43,7 +43,7 @@ simde_vrndh_f16(simde_float16_t a) { return simde_float16_from_float32(simde_math_truncf(simde_float16_to_float32(a))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrndh_f16 #define vrndh_f16(a) simde_vrndh_f16(a) #endif @@ -66,7 +66,7 @@ simde_vrnd_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrnd_f16 #define vrnd_f16(a) simde_vrnd_f16(a) #endif @@ -135,7 +135,7 @@ simde_vrndq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrndq_f16 #define vrndq_f16(a) simde_vrndq_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/rnd32x.h b/thirdparty/simde/arm/neon/rnd32x.h index 38d369aab..560f1ce0a 100644 --- a/thirdparty/simde/arm/neon/rnd32x.h +++ b/thirdparty/simde/arm/neon/rnd32x.h @@ -37,7 +37,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrnd32x_f32(simde_float32x2_t a) { - #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) return vrnd32x_f32(a); #else simde_float32x2_private @@ -59,7 +59,7 @@ simde_vrnd32x_f32(simde_float32x2_t a) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd32x_f32 #define vrnd32x_f32(a) simde_vrnd32x_f32(a) #endif @@ -67,7 +67,7 @@ simde_vrnd32x_f32(simde_float32x2_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1_t simde_vrnd32x_f64(simde_float64x1_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) return vrnd32x_f64(a); #else simde_float64x1_private @@ -89,7 +89,7 @@ simde_vrnd32x_f64(simde_float64x1_t a) { return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd32x_f64 #define vrnd32x_f64(a) simde_vrnd32x_f64(a) #endif @@ -97,7 +97,7 @@ simde_vrnd32x_f64(simde_float64x1_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrnd32xq_f32(simde_float32x4_t a) { - #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) return vrnd32xq_f32(a); #else simde_float32x4_private @@ -119,7 +119,7 @@ simde_vrnd32xq_f32(simde_float32x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd32xq_f32 #define vrnd32xq_f32(a) simde_vrnd32xq_f32(a) #endif @@ -127,7 +127,7 @@ simde_vrnd32xq_f32(simde_float32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vrnd32xq_f64(simde_float64x2_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) return vrnd32xq_f64(a); #else simde_float64x2_private @@ -149,7 +149,7 @@ simde_vrnd32xq_f64(simde_float64x2_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0))&& !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd32xq_f64 #define vrnd32xq_f64(a) simde_vrnd32xq_f64(a) #endif diff --git a/thirdparty/simde/arm/neon/rnd32z.h b/thirdparty/simde/arm/neon/rnd32z.h index 7000a128e..2b8fe28a5 100644 --- 
a/thirdparty/simde/arm/neon/rnd32z.h +++ b/thirdparty/simde/arm/neon/rnd32z.h @@ -37,7 +37,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrnd32z_f32(simde_float32x2_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) return vrnd32z_f32(a); #else simde_float32x2_private @@ -59,7 +59,7 @@ simde_vrnd32z_f32(simde_float32x2_t a) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd32z_f32 #define vrnd32z_f32(a) simde_vrnd32z_f32(a) #endif @@ -67,7 +67,7 @@ simde_vrnd32z_f32(simde_float32x2_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1_t simde_vrnd32z_f64(simde_float64x1_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) return vrnd32z_f64(a); #else simde_float64x1_private @@ -89,7 +89,7 @@ simde_vrnd32z_f64(simde_float64x1_t a) { return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd32z_f64 #define 
vrnd32z_f64(a) simde_vrnd32z_f64(a) #endif @@ -97,7 +97,7 @@ simde_vrnd32z_f64(simde_float64x1_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrnd32zq_f32(simde_float32x4_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) return vrnd32zq_f32(a); #else simde_float32x4_private @@ -119,7 +119,7 @@ simde_vrnd32zq_f32(simde_float32x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd32zq_f32 #define vrnd32zq_f32(a) simde_vrnd32zq_f32(a) #endif @@ -127,7 +127,7 @@ simde_vrnd32zq_f32(simde_float32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vrnd32zq_f64(simde_float64x2_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) return vrnd32zq_f64(a); #else simde_float64x2_private @@ -149,7 +149,7 @@ simde_vrnd32zq_f64(simde_float64x2_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0))&& !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd32zq_f64 
#define vrnd32zq_f64(a) simde_vrnd32zq_f64(a) #endif diff --git a/thirdparty/simde/arm/neon/rnd64x.h b/thirdparty/simde/arm/neon/rnd64x.h index 8464291ff..76f5df6b8 100644 --- a/thirdparty/simde/arm/neon/rnd64x.h +++ b/thirdparty/simde/arm/neon/rnd64x.h @@ -37,7 +37,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrnd64x_f32(simde_float32x2_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) return vrnd64x_f32(a); #else simde_float32x2_private @@ -59,7 +59,7 @@ simde_vrnd64x_f32(simde_float32x2_t a) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd64x_f32 #define vrnd64x_f32(a) simde_vrnd64x_f32(a) #endif @@ -67,7 +67,7 @@ simde_vrnd64x_f32(simde_float32x2_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1_t simde_vrnd64x_f64(simde_float64x1_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) return vrnd64x_f64(a); #else simde_float64x1_private @@ -89,7 +89,7 @@ simde_vrnd64x_f64(simde_float64x1_t a) { return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd64x_f64 #define vrnd64x_f64(a) simde_vrnd64x_f64(a) #endif @@ -97,7 +97,7 @@ simde_vrnd64x_f64(simde_float64x1_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrnd64xq_f32(simde_float32x4_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) return vrnd64xq_f32(a); #else simde_float32x4_private @@ -119,7 +119,7 @@ simde_vrnd64xq_f32(simde_float32x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd64xq_f32 #define vrnd64xq_f32(a) simde_vrnd64xq_f32(a) #endif @@ -127,7 +127,7 @@ simde_vrnd64xq_f32(simde_float32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vrnd64xq_f64(simde_float64x2_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) return vrnd64xq_f64(a); #else simde_float64x2_private @@ -149,7 +149,7 @@ simde_vrnd64xq_f64(simde_float64x2_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) 
|| (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd64xq_f64 #define vrnd64xq_f64(a) simde_vrnd64xq_f64(a) #endif diff --git a/thirdparty/simde/arm/neon/rnd64z.h b/thirdparty/simde/arm/neon/rnd64z.h index e63b58290..cff68b3e8 100644 --- a/thirdparty/simde/arm/neon/rnd64z.h +++ b/thirdparty/simde/arm/neon/rnd64z.h @@ -37,7 +37,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrnd64z_f32(simde_float32x2_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) return vrnd64z_f32(a); #else simde_float32x2_private @@ -59,7 +59,7 @@ simde_vrnd64z_f32(simde_float32x2_t a) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd64z_f32 #define vrnd64z_f32(a) simde_vrnd64z_f32(a) #endif @@ -67,7 +67,7 @@ simde_vrnd64z_f32(simde_float32x2_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1_t simde_vrnd64z_f64(simde_float64x1_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) return vrnd64z_f64(a); #else simde_float64x1_private @@ -89,7 +89,7 @@ simde_vrnd64z_f64(simde_float64x1_t a) { return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if 
(defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd64z_f64 #define vrnd64z_f64(a) simde_vrnd64z_f64(a) #endif @@ -97,7 +97,7 @@ simde_vrnd64z_f64(simde_float64x1_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrnd64zq_f32(simde_float32x4_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION) return vrnd64zq_f32(a); #else simde_float32x4_private @@ -119,7 +119,7 @@ simde_vrnd64zq_f32(simde_float32x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd64zq_f32 #define vrnd64zq_f32(a) simde_vrnd64zq_f32(a) #endif @@ -127,7 +127,7 @@ simde_vrnd64zq_f32(simde_float32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vrnd64zq_f64(simde_float64x2_t a) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION) return vrnd64zq_f64(a); #else simde_float64x2_private @@ -149,7 +149,7 @@ simde_vrnd64zq_f64(simde_float64x2_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) 
+#if (defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION)) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) && !defined(HEDLEY_GCC_VERSION))) #undef vrnd64zq_f64 #define vrnd64zq_f64(a) simde_vrnd64zq_f64(a) #endif diff --git a/thirdparty/simde/arm/neon/rnda.h b/thirdparty/simde/arm/neon/rnda.h index 05a540366..169002b5c 100644 --- a/thirdparty/simde/arm/neon/rnda.h +++ b/thirdparty/simde/arm/neon/rnda.h @@ -43,7 +43,7 @@ simde_vrndah_f16(simde_float16_t a) { return simde_float16_from_float32(simde_math_roundf(simde_float16_to_float32(a))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrndah_f16 #define vrndah_f16(a) simde_vrndah_f16(a) #endif @@ -69,7 +69,7 @@ simde_vrnda_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrnda_f16 #define vrnda_f16(a) simde_vrnda_f16(a) #endif @@ -163,7 +163,7 @@ simde_vrndaq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARM_NEON_FP16)) #undef vrndaq_f16 #define vrndaq_f16(a) simde_vrndaq_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/rndi.h b/thirdparty/simde/arm/neon/rndi.h index 6b985ed28..48c7323a6 100644 --- a/thirdparty/simde/arm/neon/rndi.h +++ b/thirdparty/simde/arm/neon/rndi.h @@ -43,7 +43,7 @@ 
simde_vrndih_f16(simde_float16_t a) { return simde_float16_from_float32(simde_math_nearbyintf(simde_float16_to_float32(a))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) #undef vrndih_f16 #define vrndih_f16(a) simde_vrndih_f16(a) #endif @@ -66,7 +66,7 @@ simde_vrndi_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) #undef vrndi_f16 #define vrndi_f16(a) simde_vrndi_f16(a) #endif @@ -89,7 +89,7 @@ simde_vrndi_f32(simde_float32x2_t a) { return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399))) #undef vrndi_f32 #define vrndi_f32(a) simde_vrndi_f32(a) #endif @@ -112,7 +112,7 @@ simde_vrndi_f64(simde_float64x1_t a) { return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399))) #undef vrndi_f64 #define vrndi_f64(a) simde_vrndi_f64(a) #endif @@ -135,7 +135,7 @@ simde_vrndiq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16))) #undef vrndiq_f16 #define vrndiq_f16(a) simde_vrndiq_f16(a) #endif @@ -162,7 +162,7 @@ 
simde_vrndiq_f32(simde_float32x4_t a) { return simde_float32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399))) #undef vrndiq_f32 #define vrndiq_f32(a) simde_vrndiq_f32(a) #endif @@ -189,7 +189,7 @@ simde_vrndiq_f64(simde_float64x2_t a) { return simde_float64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(!defined(SIMDE_BUG_GCC_95399))) #undef vrndiq_f64 #define vrndiq_f64(a) simde_vrndiq_f64(a) #endif diff --git a/thirdparty/simde/arm/neon/rndm.h b/thirdparty/simde/arm/neon/rndm.h index 33c2e00df..5f8d0498a 100644 --- a/thirdparty/simde/arm/neon/rndm.h +++ b/thirdparty/simde/arm/neon/rndm.h @@ -43,7 +43,7 @@ simde_vrndmh_f16(simde_float16_t a) { return simde_float16_from_float32(simde_math_floorf(simde_float16_to_float32(a))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vrndmh_f16 #define vrndmh_f16(a) simde_vrndmh_f16(a) #endif @@ -66,7 +66,7 @@ simde_vrndm_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vrndm_f16 #define vrndm_f16(a) simde_vrndm_f16(a) #endif @@ -135,7 +135,7 @@ simde_vrndmq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !(defined(SIMDE_ARM_NEON_FP16))) #undef vrndmq_f16 
#define vrndmq_f16(a) simde_vrndmq_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/rndn.h b/thirdparty/simde/arm/neon/rndn.h index c8990a10d..b2289f497 100644 --- a/thirdparty/simde/arm/neon/rndn.h +++ b/thirdparty/simde/arm/neon/rndn.h @@ -47,7 +47,9 @@ simde_vrndnh_f16(simde_float16_t a) { return simde_float16_from_float32(simde_math_roundevenf(a_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && \ + (!defined(HEDLEY_GCC_VERSION) || (defined(SIMDE_ARM_NEON_A64V8_NATIVE) && HEDLEY_GCC_VERSION_CHECK(8,0,0))) && defined(SIMDE_ARM_NEON_FP16))) #undef vrndnh_f16 #define vrndnh_f16(a) simde_vrndnh_f16(a) #endif @@ -64,7 +66,9 @@ simde_vrndns_f32(simde_float32_t a) { return simde_math_roundevenf(a); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && \ + (!defined(HEDLEY_GCC_VERSION) || (defined(SIMDE_ARM_NEON_A64V8_NATIVE) && HEDLEY_GCC_VERSION_CHECK(8,0,0))))) #undef vrndns_f32 #define vrndns_f32(a) simde_vrndns_f32(a) #endif @@ -87,7 +91,8 @@ simde_vrndn_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrndn_f16 #define vrndn_f16(a) simde_vrndn_f16(a) #endif @@ -157,7 +162,8 @@ simde_vrndnq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) 
#undef vrndnq_f16 #define vrndnq_f16(a) simde_vrndnq_f16(a) #endif @@ -192,8 +198,7 @@ simde_vrndnq_f32(simde_float32x4_t a) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vrndnq_f64(simde_float64x2_t a) { - #if \ - defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vrndnq_f64(a); #else simde_float64x2_private diff --git a/thirdparty/simde/arm/neon/rndp.h b/thirdparty/simde/arm/neon/rndp.h index 6b23136c5..ac4f88c14 100644 --- a/thirdparty/simde/arm/neon/rndp.h +++ b/thirdparty/simde/arm/neon/rndp.h @@ -43,7 +43,8 @@ simde_vrndph_f16(simde_float16_t a) { return simde_float16_from_float32(simde_math_ceilf(simde_float16_to_float32(a))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrndph_f16 #define vrndph_f16(a) simde_vrndph_f16(a) #endif @@ -66,7 +67,8 @@ simde_vrndp_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrndp_f16 #define vrndp_f16(a) simde_vrndp_f16(a) #endif @@ -135,7 +137,8 @@ simde_vrndpq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrndpq_f16 #define vrndpq_f16(a) simde_vrndpq_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/rndx.h b/thirdparty/simde/arm/neon/rndx.h index 406f5e753..d12e19850 100644 --- a/thirdparty/simde/arm/neon/rndx.h +++ b/thirdparty/simde/arm/neon/rndx.h @@ -42,7 +42,8 @@ simde_vrndxh_f16(simde_float16_t a) { return 
simde_float16_from_float32(simde_math_rintf(simde_float16_to_float32(a))); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrndxh_f16 #define vrndxh_f16(a) simde_vrndxh_f16(a) #endif @@ -65,7 +66,8 @@ simde_vrndx_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrndx_f16 #define vrndx_f16(a) simde_vrndx_f16(a) #endif @@ -134,7 +136,8 @@ simde_vrndxq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrndxq_f16 #define vrndxq_f16(a) simde_vrndxq_f16(a) #endif diff --git a/thirdparty/simde/arm/neon/rshr_n.h b/thirdparty/simde/arm/neon/rshr_n.h index a27495536..bb3de79ba 100644 --- a/thirdparty/simde/arm/neon/rshr_n.h +++ b/thirdparty/simde/arm/neon/rshr_n.h @@ -144,7 +144,7 @@ simde_vrshrq_n_s16 (const simde_int16x8_t a, const int n) #define simde_vrshrq_n_s16(a, n) vrshrq_n_s16((a), (n)) #elif SIMDE_NATURAL_VECTOR_SIZE > 0 #define simde_vrshrq_n_s16(a, n) simde_vsubq_s16(simde_vshrq_n_s16((a), (n)), simde_vreinterpretq_s16_u16( \ - simde_vtstq_u16(simde_vreinterpretq_u16_s16(a), \ + simde_vtstq_u16(simde_vreinterpretq_u16_s16(a), \ simde_vdupq_n_u16(HEDLEY_STATIC_CAST(uint16_t, 1 << ((n) - 1)))))) #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -329,7 +329,7 @@ simde_vrshr_n_s8 (const simde_int8x8_t a, const int n) #define simde_vrshr_n_s8(a, n) vrshr_n_s8((a), (n)) #elif SIMDE_NATURAL_VECTOR_SIZE > 0 #define simde_vrshr_n_s8(a, n) 
simde_vsub_s8(simde_vshr_n_s8((a), (n)), simde_vreinterpret_s8_u8( \ - simde_vtst_u8(simde_vreinterpret_u8_s8(a), \ + simde_vtst_u8(simde_vreinterpret_u8_s8(a), \ simde_vdup_n_u8(HEDLEY_STATIC_CAST(uint8_t, 1 << ((n) - 1)))))) #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) diff --git a/thirdparty/simde/arm/neon/rsqrte.h b/thirdparty/simde/arm/neon/rsqrte.h index d3a1c5ac1..7899fd2db 100644 --- a/thirdparty/simde/arm/neon/rsqrte.h +++ b/thirdparty/simde/arm/neon/rsqrte.h @@ -51,7 +51,8 @@ simde_vrsqrteh_f16(simde_float16_t a) { #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrsqrteh_f16 #define vrsqrteh_f16(a) simde_vrsqrteh_f16((a)) #endif @@ -188,7 +189,8 @@ simde_vrsqrte_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrsqrte_f16 #define vrsqrte_f16(a) simde_vrsqrte_f16((a)) #endif @@ -350,7 +352,8 @@ simde_vrsqrteq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrsqrteq_f16 #define vrsqrteq_f16(a) simde_vrsqrteq_f16((a)) #endif diff --git a/thirdparty/simde/arm/neon/rsqrts.h b/thirdparty/simde/arm/neon/rsqrts.h index 633ad3aaf..612a597a1 100644 --- a/thirdparty/simde/arm/neon/rsqrts.h +++ b/thirdparty/simde/arm/neon/rsqrts.h @@ -53,7 +53,8 @@ simde_vrsqrtsh_f16(simde_float16_t a, simde_float16_t b) { ); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) 
|| (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrsqrtsh_f16 #define vrsqrtsh_f16(a, b) simde_vrsqrtsh_f16((a), (b)) #endif @@ -101,7 +102,8 @@ simde_vrsqrts_f16(simde_float16x4_t a, simde_float16x4_t b) { ); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrsqrts_f16 #define vrsqrts_f16(a, b) simde_vrsqrts_f16((a), (b)) #endif @@ -163,7 +165,8 @@ simde_vrsqrtsq_f16(simde_float16x8_t a, simde_float16x8_t b) { ); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vrsqrtsq_f16 #define vrsqrtsq_f16(a, b) simde_vrsqrtsq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/set_lane.h b/thirdparty/simde/arm/neon/set_lane.h index 1c230f39b..57813d31b 100644 --- a/thirdparty/simde/arm/neon/set_lane.h +++ b/thirdparty/simde/arm/neon/set_lane.h @@ -48,7 +48,8 @@ simde_vset_lane_f16(simde_float16_t a, simde_float16x4_t v, const int lane) #endif return r; } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vset_lane_f16 #define vset_lane_f16(a, b, c) simde_vset_lane_f16((a), (b), (c)) #endif @@ -260,7 +261,8 @@ simde_vsetq_lane_f16(simde_float16_t a, simde_float16x8_t v, const int lane) #endif return r; } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16))) #undef vsetq_lane_f16 #define vsetq_lane_f16(a, b, c) simde_vsetq_lane_f16((a), (b), (c)) #endif @@ -468,7 +470,8 @@ simde_vset_lane_p8(simde_poly8_t a, simde_poly8x8_t v, 
const int lane) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vset_lane_p8(a, b, c) vset_lane_p8((a), (b), (c)) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vset_lane_p8 #define vset_lane_p8(a, b, c) simde_vset_lane_p8((a), (b), (c)) #endif @@ -486,7 +489,8 @@ simde_vset_lane_p16(simde_poly16_t a, simde_poly16x4_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vset_lane_p16(a, b, c) vset_lane_p16((a), (b), (c)) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vset_lane_p16 #define vset_lane_p16(a, b, c) simde_vset_lane_p16((a), (b), (c)) #endif @@ -504,7 +508,8 @@ simde_vset_lane_p64(simde_poly64_t a, simde_poly64x1_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vset_lane_p64(a, b, c) vset_lane_p64((a), (b), (c)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vset_lane_p64 #define vset_lane_p64(a, b, c) simde_vset_lane_p64((a), (b), (c)) #endif @@ -522,7 +527,8 @@ simde_vsetq_lane_p8(simde_poly8_t a, simde_poly8x16_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vsetq_lane_p8(a, b, c) vsetq_lane_p8((a), (b), (c)) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vsetq_lane_p8 #define vsetq_lane_p8(a, b, c) 
simde_vsetq_lane_p8((a), (b), (c)) #endif @@ -540,7 +546,8 @@ simde_vsetq_lane_p16(simde_poly16_t a, simde_poly16x8_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vsetq_lane_p16(a, b, c) vsetq_lane_p16((a), (b), (c)) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vsetq_lane_p16 #define vsetq_lane_p16(a, b, c) simde_vsetq_lane_p16((a), (b), (c)) #endif @@ -558,7 +565,8 @@ simde_vsetq_lane_p64(simde_poly64_t a, simde_poly64x2_t v, const int lane) #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) #define simde_vsetq_lane_p64(a, b, c) vsetq_lane_p64((a), (b), (c)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_CLANG_71362)) #undef vsetq_lane_p64 #define vsetq_lane_p64(a, b, c) simde_vsetq_lane_p64((a), (b), (c)) #endif @@ -577,7 +585,8 @@ simde_vset_lane_bf16(simde_bfloat16_t a, simde_bfloat16x4_t v, const int lane) #endif return r; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vset_lane_bf16 #define vset_lane_bf16(a, b, c) simde_vset_lane_bf16((a), (b), (c)) #endif @@ -596,7 +605,8 @@ simde_vsetq_lane_bf16(simde_bfloat16_t a, simde_bfloat16x8_t v, const int lane) #endif return r; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vsetq_lane_bf16 #define vsetq_lane_bf16(a, b, c) simde_vsetq_lane_bf16((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/sha1.h 
b/thirdparty/simde/arm/neon/sha1.h index 73b7988e3..d1f680390 100644 --- a/thirdparty/simde/arm/neon/sha1.h +++ b/thirdparty/simde/arm/neon/sha1.h @@ -44,7 +44,8 @@ simde_vsha1h_u32(uint32_t a) { return ROL(a, 32, 30); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) #undef vsha1h_u32 #define vsha1h_u32(a) simde_vsha1h_u32((a)) #endif @@ -74,7 +75,8 @@ simde_vsha1cq_u32(simde_uint32x4_t hash_abcd, uint32_t hash_e, simde_uint32x4_t return simde_uint32x4_from_private(x_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) #undef vsha1cq_u32 #define vsha1cq_u32(hash_abcd, hash_e, wk) simde_vsha1cq_u32((hash_abcd), (hash_e), (wk)) #endif @@ -104,7 +106,8 @@ simde_vsha1mq_u32(simde_uint32x4_t hash_abcd, uint32_t hash_e, simde_uint32x4_t return simde_uint32x4_from_private(x_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) #undef vsha1mq_u32 #define vsha1mq_u32(hash_abcd, hash_e, wk) simde_vsha1mq_u32((hash_abcd), (hash_e), (wk)) #endif @@ -134,7 +137,8 @@ simde_vsha1pq_u32(simde_uint32x4_t hash_abcd, uint32_t hash_e, simde_uint32x4_t return simde_uint32x4_from_private(x_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) #undef vsha1pq_u32 #define vsha1pq_u32(hash_abcd, hash_e, wk) simde_vsha1pq_u32((hash_abcd), (hash_e), (wk)) #endif @@ -161,7 +165,8 @@ simde_vsha1su0q_u32(simde_uint32x4_t w0_3, simde_uint32x4_t w4_7, simde_uint32x4 return 
simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) #undef vsha1su0q_u32 #define vsha1su0q_u32(w0_3, w4_7, w8_11) simde_vsha1su0q_u32((w0_3), (w4_7), (w8_11)) #endif @@ -189,7 +194,8 @@ simde_vsha1su1q_u32(simde_uint32x4_t tw0_3, simde_uint32x4_t tw12_15) { return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) #undef vsha1su1q_u32 #define vsha1su1q_u32(tw0_3, tw12_15) simde_vsha1su1q_u32((tw0_3), (tw12_15)) #endif diff --git a/thirdparty/simde/arm/neon/sha256.h b/thirdparty/simde/arm/neon/sha256.h index bf3853d0e..38fe3b4e0 100644 --- a/thirdparty/simde/arm/neon/sha256.h +++ b/thirdparty/simde/arm/neon/sha256.h @@ -90,7 +90,8 @@ simde_vsha256hq_u32(simde_uint32x4_t hash_efgh, simde_uint32x4_t hash_abcd, simd return x_simde_sha256hash(hash_efgh, hash_abcd, wk, 1); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) #undef vsha256hq_u32 #define vsha256hq_u32(hash_efgh, hash_abcd, wk) simde_vsha256hq_u32((hash_efgh), (hash_abcd), (wk)) #endif @@ -104,7 +105,8 @@ simde_vsha256h2q_u32(simde_uint32x4_t hash_efgh, simde_uint32x4_t hash_abcd, sim return x_simde_sha256hash(hash_abcd, hash_efgh, wk, 0); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) #undef vsha256h2q_u32 #define vsha256h2q_u32(hash_efgh, hash_abcd, wk) simde_vsha256h2q_u32((hash_efgh), (hash_abcd), (wk)) #endif @@ -134,7 +136,8 @@ 
simde_vsha256su0q_u32(simde_uint32x4_t w0_3, simde_uint32x4_t w4_7) { #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) #undef vsha256su0q_u32 #define vsha256su0q_u32(w0_3, w4_7) simde_vsha256su0q_u32((w0_3), (w4_7)) #endif @@ -177,7 +180,8 @@ simde_vsha256su1q_u32(simde_uint32x4_t tw0_3, simde_uint32x4_t w8_11, simde_uint #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA2)) #undef vsha256su1q_u32 #define vsha256su1q_u32(tw0_3, w8_11, w12_15) simde_vsha256su1q_u32((tw0_3), (w8_11), (w12_15)) #endif diff --git a/thirdparty/simde/arm/neon/sha512.h b/thirdparty/simde/arm/neon/sha512.h index db90c95dd..734cf34fe 100644 --- a/thirdparty/simde/arm/neon/sha512.h +++ b/thirdparty/simde/arm/neon/sha512.h @@ -62,7 +62,8 @@ simde_vsha512hq_u64(simde_uint64x2_t w, simde_uint64x2_t x, simde_uint64x2_t y) #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA512)) #undef vsha512hq_u64 #define vsha512hq_u64(w, x, y) simde_vsha512hq_u64((w), (x), (y)) #endif @@ -88,7 +89,8 @@ simde_vsha512h2q_u64(simde_uint64x2_t w, simde_uint64x2_t x, simde_uint64x2_t y) return simde_uint64x2_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA512)) #undef vsha512h2q_u64 #define vsha512h2q_u64(w, x, y) simde_vsha512h2q_u64((w), (x), (y)) #endif @@ -112,7 +114,8 @@ simde_vsha512su0q_u64(simde_uint64x2_t w, simde_uint64x2_t x) { #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA512)) #undef vsha512su0q_u64 #define vsha512su0q_u64(w, x) simde_vsha512su0q_u64((w), (x)) #endif @@ -137,7 +140,8 @@ simde_vsha512su1q_u64(simde_uint64x2_t w, simde_uint64x2_t x, simde_uint64x2_t y #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SHA512)) #undef vsha512su1q_u64 #define vsha512su1q_u64(w, x, y) simde_vsha512su1q_u64((w), (x), (y)) #endif diff --git a/thirdparty/simde/arm/neon/shrn_high_n.h b/thirdparty/simde/arm/neon/shrn_high_n.h index bb45c37c8..141ab6307 100644 --- a/thirdparty/simde/arm/neon/shrn_high_n.h +++ b/thirdparty/simde/arm/neon/shrn_high_n.h @@ -73,8 +73,8 @@ SIMDE_BEGIN_DECLS_ #define simde_vshrn_high_n_u16(r, a, n) vshrn_high_n_u16((r), (a), (n)) #else #define simde_vshrn_high_n_u16(r, a, n) \ - simde_vreinterpretq_u8_s8( \ - simde_vcombine_s8(simde_vreinterpret_s8_u8(r), \ + simde_vreinterpretq_u8_s8( \ + simde_vcombine_s8(simde_vreinterpret_s8_u8(r), \ simde_vshrn_n_s16(simde_vreinterpretq_s16_u16(a), (n)))) #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -87,7 +87,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vshrn_high_n_u32(r, a, n) \ simde_vreinterpretq_u16_s16( \ - simde_vcombine_s16(simde_vreinterpret_s16_u16(r), \ + simde_vcombine_s16(simde_vreinterpret_s16_u16(r), \ simde_vshrn_n_s32(simde_vreinterpretq_s32_u32(a), (n)))) #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -100,7 +100,7 @@ SIMDE_BEGIN_DECLS_ #else #define simde_vshrn_high_n_u64(r, a, n) \ simde_vreinterpretq_u32_s32( \ - simde_vcombine_s32(simde_vreinterpret_s32_u32(r), \ + simde_vcombine_s32(simde_vreinterpret_s32_u32(r), \ simde_vshrn_n_s64(simde_vreinterpretq_s64_u64(a), (n)))) #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) diff --git 
a/thirdparty/simde/arm/neon/shrn_n.h b/thirdparty/simde/arm/neon/shrn_n.h index 5c14a73b0..ae797d8d0 100644 --- a/thirdparty/simde/arm/neon/shrn_n.h +++ b/thirdparty/simde/arm/neon/shrn_n.h @@ -126,7 +126,7 @@ simde_vshrn_n_s64 (const simde_int64x2_t a, const int n) #define simde_vshrn_n_u16(a, n) vshrn_n_u16((a), (n)) #else #define simde_vshrn_n_u16(a, n) \ - simde_vreinterpret_u8_s8( \ + simde_vreinterpret_u8_s8( \ simde_vshrn_n_s16(simde_vreinterpretq_s16_u16(a), (n))) #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) diff --git a/thirdparty/simde/arm/neon/sm3.h b/thirdparty/simde/arm/neon/sm3.h index b94b649e2..737601b08 100644 --- a/thirdparty/simde/arm/neon/sm3.h +++ b/thirdparty/simde/arm/neon/sm3.h @@ -56,14 +56,15 @@ simde_vsm3ss1q_u32(simde_uint32x4_t n, simde_uint32x4_t m, simde_uint32x4_t a) { return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) #undef vsm3ss1q_u32 #define vsm3ss1q_u32(n, m, a) simde_vsm3ss1q_u32((n), (m), (a)) #endif -#if defined(SIMDE_ARCH_RISCV64) && HEDLEY_GCC_VERSION_CHECK(14,0,0) +#if HEDLEY_GCC_VERSION_CHECK(14,0,0) && (defined(SIMDE_ARCH_RISCV64) || defined(SIMDE_ARCH_LOONGARCH)) HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif SIMDE_FUNCTION_ATTRIBUTES @@ -89,9 +90,10 @@ simde_vsm3tt1aq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, return simde_uint32x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) - #define simde_vsm3tt1aq_u32(a, b, c, imm2) vsm3tt1aq_u32((a), (b), (c), (imm2)); + #define simde_vsm3tt1aq_u32(a, b, c, imm2) vsm3tt1aq_u32((a), (b), (c), (imm2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) #undef vsm3tt1aq_u32 #define vsm3tt1aq_u32(a, b, c, imm2) simde_vsm3tt1aq_u32((a), (b), (c), (imm2)) #endif @@ -119,9 +121,10 @@ simde_vsm3tt1bq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, return simde_uint32x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) - #define simde_vsm3tt1bq_u32(a, b, c, imm2) vsm3tt1bq_u32((a), (b), (c), (imm2)); + #define simde_vsm3tt1bq_u32(a, b, c, imm2) vsm3tt1bq_u32((a), (b), (c), (imm2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) #undef vsm3tt1bq_u32 #define vsm3tt1bq_u32(a, b, c, imm2) simde_vsm3tt1bq_u32((a), (b), (c), (imm2)) #endif @@ -148,9 +151,10 @@ simde_vsm3tt2aq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, return simde_uint32x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) - #define simde_vsm3tt2aq_u32(a, b, c, imm2) vsm3tt2aq_u32((a), (b), (c), (imm2)); + #define simde_vsm3tt2aq_u32(a, b, c, imm2) vsm3tt2aq_u32((a), (b), (c), (imm2)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) #undef vsm3tt2aq_u32 #define vsm3tt2aq_u32(a, b, c, imm2) simde_vsm3tt2aq_u32((a), (b), (c), (imm2)) #endif @@ -177,14 +181,15 @@ simde_vsm3tt2bq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, return simde_uint32x4_from_private(r_); } #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) - #define simde_vsm3tt2bq_u32(a, b, c, imm2) vsm3tt2bq_u32((a), (b), (c), (imm2)); + #define simde_vsm3tt2bq_u32(a, b, c, imm2) vsm3tt2bq_u32((a), (b), (c), (imm2)) #endif -#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) #undef vsm3tt2bq_u32 #define vsm3tt2bq_u32(a, b, c, imm2) simde_vsm3tt2bq_u32((a), (b), (c), (imm2)) #endif -#if defined(SIMDE_ARCH_RISCV64) && HEDLEY_GCC_VERSION_CHECK(14,0,0) +#if HEDLEY_GCC_VERSION_CHECK(14,0,0) && (defined(SIMDE_ARCH_RISCV64) || defined(SIMDE_ARCH_LOONGARCH)) HEDLEY_DIAGNOSTIC_POP #endif @@ -211,7 +216,8 @@ simde_vsm3partw1q_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) #undef vsm3partw1q_u32 #define vsm3partw1q_u32(a, b, c) simde_vsm3partw1q_u32((a), (b), (c)) #endif @@ -244,7 +250,8 @@ simde_vsm3partw2q_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c return simde_uint32x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM3)) #undef vsm3partw2q_u32 #define vsm3partw2q_u32(a, b, c) simde_vsm3partw2q_u32((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/sm4.h b/thirdparty/simde/arm/neon/sm4.h index 21e270508..776ada4a5 100644 --- a/thirdparty/simde/arm/neon/sm4.h +++ b/thirdparty/simde/arm/neon/sm4.h @@ -102,7 +102,8 @@ simde_vsm4eq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { return simde_uint32x4_from_private(a_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM4)) #undef vsm4eq_u32 #define vsm4eq_u32(a, b) simde_vsm4eq_u32((a), (b)) #endif @@ -139,7 +140,8 @@ 
simde_vsm4ekeyq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { return simde_uint32x4_from_private(a_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_SM4)) #undef vsm4ekeyq_u32 #define vsm4ekeyq_u32(a, b) simde_vsm4ekeyq_u32((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/sqrt.h b/thirdparty/simde/arm/neon/sqrt.h index 01f212b50..8ae870262 100644 --- a/thirdparty/simde/arm/neon/sqrt.h +++ b/thirdparty/simde/arm/neon/sqrt.h @@ -46,7 +46,8 @@ simde_vsqrth_f16(simde_float16_t a) { HEDLEY_UNREACHABLE(); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vsqrth_f16 #define vsqrth_f16(a) simde_vsqrth_f16((a)) #endif @@ -73,7 +74,8 @@ simde_vsqrt_f16(simde_float16x4_t a) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vsqrt_f16 #define vsqrt_f16(a) simde_vsqrt_f16((a)) #endif @@ -155,7 +157,8 @@ simde_vsqrtq_f16(simde_float16x8_t a) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vsqrtq_f16 #define vsqrtq_f16(a) simde_vsqrtq_f16((a)) #endif diff --git a/thirdparty/simde/arm/neon/st1.h b/thirdparty/simde/arm/neon/st1.h index 2e9b912a7..858d83606 100644 --- a/thirdparty/simde/arm/neon/st1.h +++ b/thirdparty/simde/arm/neon/st1.h @@ -49,7 +49,8 @@ simde_vst1_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x4_t val #endif #endif } -#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1_f16 #define vst1_f16(a, b) simde_vst1_f16((a), (b)) #endif @@ -261,7 +262,8 @@ simde_vst1q_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_float16x8_t va #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1q_f16 #define vst1q_f16(a, b) simde_vst1q_f16((a), (b)) #endif @@ -604,7 +606,7 @@ simde_vst1q_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x2_t val) SIMDE_FUNCTION_ATTRIBUTES void simde_vstrq_p128(simde_poly128_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_poly128_t val) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) vstrq_p128(ptr, val); #else simde_memcpy(ptr, &val, sizeof(val)); @@ -626,7 +628,8 @@ simde_vst1_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x4_t simde_memcpy(ptr, &val_, sizeof(val_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst1_bf16 #define vst1_bf16(a, b) simde_vst1_bf16((a), (b)) #endif @@ -641,7 +644,8 @@ simde_vst1q_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_bfloat16x8_t simde_memcpy(ptr, &val_, sizeof(val_)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst1q_bf16 #define vst1q_bf16(a, b) simde_vst1q_bf16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/st1_lane.h b/thirdparty/simde/arm/neon/st1_lane.h index 8facb2e77..8d01b13de 
100644 --- a/thirdparty/simde/arm/neon/st1_lane.h +++ b/thirdparty/simde/arm/neon/st1_lane.h @@ -28,6 +28,7 @@ #if !defined(SIMDE_ARM_NEON_ST1_LANE_H) #define SIMDE_ARM_NEON_ST1_LANE_H + #include "types.h" HEDLEY_DIAGNOSTIC_PUSH @@ -45,7 +46,8 @@ simde_vst1_lane_f16(simde_float16_t *ptr, simde_float16x4_t val, const int lane) *ptr = val_.values[lane]; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1_lane_f16 #define vst1_lane_f16(a, b, c) simde_vst1_lane_f16((a), (b), (c)) #endif @@ -224,7 +226,8 @@ simde_vst1q_lane_f16(simde_float16_t *ptr, simde_float16x8_t val, const int lane *ptr = val_.values[lane]; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1q_lane_f16 #define vst1q_lane_f16(a, b, c) simde_vst1q_lane_f16((a), (b), (c)) #endif @@ -497,7 +500,8 @@ simde_vst1_lane_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4_t val, const int la *ptr = val_.values[lane]; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst1_lane_bf16 #define vst1_lane_bf16(a, b, c) simde_vst1_lane_bf16((a), (b), (c)) #endif @@ -513,7 +517,8 @@ simde_vst1q_lane_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8_t val, const int l *ptr = val_.values[lane]; #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst1q_lane_bf16 #define vst1q_lane_bf16(a, b, c) simde_vst1q_lane_bf16((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/st1_x2.h 
b/thirdparty/simde/arm/neon/st1_x2.h index 2b9f94c96..fed93bd65 100644 --- a/thirdparty/simde/arm/neon/st1_x2.h +++ b/thirdparty/simde/arm/neon/st1_x2.h @@ -31,6 +31,7 @@ #define SIMDE_ARM_NEON_ST1_X2_H #include "types.h" +#include "st1.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -58,7 +59,8 @@ simde_vst1_f16_x2(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_float16x4x2_ #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(defined(SIMDE_ARM_NEON_FP16) && !defined(SIMDE_BUG_GCC_REV_260989))) #undef vst1_f16_x2 #define vst1_f16_x2(ptr, val) simde_vst1_f16_x2((ptr), (val)) #endif @@ -73,7 +75,8 @@ simde_vst1_f32_x2(simde_float32 ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x2x2_t simde_vst1_f32(ptr+2, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_f32_x2 #define vst1_f32_x2(ptr, val) simde_vst1_f32_x2((ptr), (val)) #endif @@ -103,7 +106,8 @@ simde_vst1_s8_x2(int8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int8x8x2_t val) { simde_vst1_s8(ptr+8, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_s8_x2 #define vst1_s8_x2(ptr, val) simde_vst1_s8_x2((ptr), (val)) #endif @@ -118,7 +122,8 @@ simde_vst1_s16_x2(int16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int16x4x2_t val) { simde_vst1_s16(ptr+4, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_s16_x2 #define vst1_s16_x2(ptr, val) simde_vst1_s16_x2((ptr), 
(val)) #endif @@ -133,7 +138,8 @@ simde_vst1_s32_x2(int32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int32x2x2_t val) { simde_vst1_s32(ptr+2, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_s32_x2 #define vst1_s32_x2(ptr, val) simde_vst1_s32_x2((ptr), (val)) #endif @@ -148,7 +154,8 @@ simde_vst1_s64_x2(int64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x1x2_t val) { simde_vst1_s64(ptr+1, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_s64_x2 #define vst1_s64_x2(ptr, val) simde_vst1_s64_x2((ptr), (val)) #endif @@ -163,7 +170,8 @@ simde_vst1_u8_x2(uint8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint8x8x2_t val) { simde_vst1_u8(ptr+8, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_u8_x2 #define vst1_u8_x2(ptr, val) simde_vst1_u8_x2((ptr), (val)) #endif @@ -178,7 +186,8 @@ simde_vst1_u16_x2(uint16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint16x4x2_t val) { simde_vst1_u16(ptr+4, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_u16_x2 #define vst1_u16_x2(ptr, val) simde_vst1_u16_x2((ptr), (val)) #endif @@ -193,7 +202,8 @@ simde_vst1_u32_x2(uint32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint32x2x2_t val) { simde_vst1_u32(ptr+2, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_u32_x2 #define vst1_u32_x2(ptr, val) simde_vst1_u32_x2((ptr), (val)) #endif @@ -208,7 +218,8 @@ simde_vst1_u64_x2(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x1x2_t val) { simde_vst1_u64(ptr+1, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_u64_x2 #define vst1_u64_x2(ptr, val) simde_vst1_u64_x2((ptr), (val)) #endif @@ -232,7 +243,8 @@ simde_vst1_p8_x2(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly8x8x2_t va #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1_p8_x2 #define vst1_p8_x2(a, b) simde_vst1_p8_x2((a), (b)) #endif @@ -256,7 +268,8 @@ simde_vst1_p16_x2(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly16x4x2_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1_p16_x2 #define vst1_p16_x2(a, b) simde_vst1_p16_x2((a), (b)) #endif @@ -280,7 +293,8 @@ simde_vst1_p64_x2(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x1x2_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1_p64_x2 #define vst1_p64_x2(a, b) simde_vst1_p64_x2((a), (b)) #endif 
@@ -298,7 +312,8 @@ simde_vst1_bf16_x2(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_bfloat16x4 simde_memcpy(ptr, &val_, sizeof(val_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst1_bf16_x2 #define vst1_bf16_x2(a, b) simde_vst1_bf16_x2((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/st1_x3.h b/thirdparty/simde/arm/neon/st1_x3.h index 510c9d67e..097c4aabd 100644 --- a/thirdparty/simde/arm/neon/st1_x3.h +++ b/thirdparty/simde/arm/neon/st1_x3.h @@ -31,6 +31,7 @@ #define SIMDE_ARM_NEON_ST1_X3_H #include "types.h" +#include "st1.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -60,7 +61,8 @@ simde_vst1_f16_x3(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float16x4x3 #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1_f16_x3 #define vst1_f16_x3(a, b) simde_vst1_f16_x3((a), (b)) #endif @@ -76,7 +78,8 @@ simde_vst1_f32_x3(simde_float32 ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t simde_vst1_f32(ptr+4, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_f32_x3 #define vst1_f32_x3(ptr, val) simde_vst1_f32_x3((ptr), (val)) #endif @@ -108,7 +111,8 @@ simde_vst1_s8_x3(int8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int8x8x3_t val) { simde_vst1_s8(ptr+16, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_s8_x3 #define vst1_s8_x3(ptr, val) simde_vst1_s8_x3((ptr), 
(val)) #endif @@ -124,7 +128,8 @@ simde_vst1_s16_x3(int16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int16x4x3_t val) { simde_vst1_s16(ptr+8, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_s16_x3 #define vst1_s16_x3(ptr, val) simde_vst1_s16_x3((ptr), (val)) #endif @@ -140,7 +145,8 @@ simde_vst1_s32_x3(int32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int32x2x3_t val) { simde_vst1_s32(ptr+4, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_s32_x3 #define vst1_s32_x3(ptr, val) simde_vst1_s32_x3((ptr), (val)) #endif @@ -156,7 +162,8 @@ simde_vst1_s64_x3(int64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x1x3_t val) { simde_vst1_s64(ptr+2, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_s64_x3 #define vst1_s64_x3(ptr, val) simde_vst1_s64_x3((ptr), (val)) #endif @@ -172,7 +179,8 @@ simde_vst1_u8_x3(uint8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint8x8x3_t val) { simde_vst1_u8(ptr+16, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_u8_x3 #define vst1_u8_x3(ptr, val) simde_vst1_u8_x3((ptr), (val)) #endif @@ -188,7 +196,8 @@ simde_vst1_u16_x3(uint16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint16x4x3_t val) simde_vst1_u16(ptr+8, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_u16_x3 #define vst1_u16_x3(ptr, val) simde_vst1_u16_x3((ptr), (val)) #endif @@ -204,7 +213,8 @@ simde_vst1_u32_x3(uint32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint32x2x3_t val) { simde_vst1_u32(ptr+4, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_u32_x3 #define vst1_u32_x3(ptr, val) simde_vst1_u32_x3((ptr), (val)) #endif @@ -220,7 +230,8 @@ simde_vst1_u64_x3(uint64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t val) { simde_vst1_u64(ptr+2, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1_u64_x3 #define vst1_u64_x3(ptr, val) simde_vst1_u64_x3((ptr), (val)) #endif @@ -245,7 +256,8 @@ simde_vst1_p8_x3(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly8x8x3_t va #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1_p8_x3 #define vst1_p8_x3(a, b) simde_vst1_p8_x3((a), (b)) #endif @@ -270,7 +282,8 @@ simde_vst1_p16_x3(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_poly16x4x3_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1_p16_x3 #define vst1_p16_x3(a, b) simde_vst1_p16_x3((a), (b)) #endif @@ -295,7 +308,8 @@ 
simde_vst1_p64_x3(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1_p64_x3 #define vst1_p64_x3(a, b) simde_vst1_p64_x3((a), (b)) #endif @@ -313,7 +327,8 @@ simde_vst1_bf16_x3(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_bfloat16x simde_memcpy(ptr, &val_, sizeof(val_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst1_bf16_x3 #define vst1_bf16_x3(a, b) simde_vst1_bf16_x3((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/st1_x4.h b/thirdparty/simde/arm/neon/st1_x4.h index aa8c17db9..5d3ed1541 100644 --- a/thirdparty/simde/arm/neon/st1_x4.h +++ b/thirdparty/simde/arm/neon/st1_x4.h @@ -31,6 +31,7 @@ #define SIMDE_ARM_NEON_ST1_X4_H #include "types.h" +#include "st1.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -60,7 +61,8 @@ simde_vst1_f16_x4(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_float16x4x4 #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1_f16_x4 #define vst1_f16_x4(a, b) simde_vst1_f16_x4((a), (b)) #endif @@ -78,7 +80,8 @@ simde_vst1_f32_x4(simde_float32 ptr[HEDLEY_ARRAY_PARAM(8)], simde_float32x2x4_t simde_vst1_f32(ptr+6, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) #undef 
vst1_f32_x4 #define vst1_f32_x4(ptr, val) simde_vst1_f32_x4((ptr), (val)) #endif @@ -95,7 +98,8 @@ simde_vst1_f64_x4(simde_float64 ptr[HEDLEY_ARRAY_PARAM(4)], simde_float64x1x4_t simde_vst1_f64(ptr+3, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_114521)) #undef vst1_f64_x4 #define vst1_f64_x4(ptr, val) simde_vst1_f64_x4((ptr), (val)) #endif @@ -113,7 +117,8 @@ simde_vst1_s8_x4(int8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_int8x8x4_t val) { simde_vst1_s8(ptr+24, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_s8_x4 #define vst1_s8_x4(ptr, val) simde_vst1_s8_x4((ptr), (val)) #endif @@ -131,7 +136,8 @@ simde_vst1_s16_x4(int16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int16x4x4_t val) { simde_vst1_s16(ptr+12, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_s16_x4 #define vst1_s16_x4(ptr, val) simde_vst1_s16_x4((ptr), (val)) #endif @@ -149,7 +155,8 @@ simde_vst1_s32_x4(int32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int32x2x4_t val) { simde_vst1_s32(ptr+6, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_s32_x4 #define vst1_s32_x4(ptr, val) simde_vst1_s32_x4((ptr), (val)) #endif @@ -167,7 +174,8 @@ simde_vst1_s64_x4(int64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int64x1x4_t val) { 
simde_vst1_s64(ptr+3, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_s64_x4 #define vst1_s64_x4(ptr, val) simde_vst1_s64_x4((ptr), (val)) #endif @@ -185,7 +193,8 @@ simde_vst1_u8_x4(uint8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint8x8x4_t val) { simde_vst1_u8(ptr+24, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_u8_x4 #define vst1_u8_x4(ptr, val) simde_vst1_u8_x4((ptr), (val)) #endif @@ -203,7 +212,8 @@ simde_vst1_u16_x4(uint16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint16x4x4_t val) simde_vst1_u16(ptr+12, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_u16_x4 #define vst1_u16_x4(ptr, val) simde_vst1_u16_x4((ptr), (val)) #endif @@ -221,7 +231,8 @@ simde_vst1_u32_x4(uint32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint32x2x4_t val) { simde_vst1_u32(ptr+6, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_u32_x4 #define vst1_u32_x4(ptr, val) simde_vst1_u32_x4((ptr), (val)) #endif @@ -239,7 +250,8 @@ simde_vst1_u64_x4(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x1x4_t val) { simde_vst1_u64(ptr+3, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(SIMDE_BUG_GCC_REV_260989) && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_u64_x4 #define vst1_u64_x4(ptr, val) simde_vst1_u64_x4((ptr), (val)) #endif @@ -266,7 +278,9 @@ simde_vst1_p8_x4(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly8x8x4_t va #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_p8_x4 #define vst1_p8_x4(a, b) simde_vst1_p8_x4((a), (b)) #endif @@ -293,7 +307,9 @@ simde_vst1_p16_x4(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly16x4x4_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_p16_x4 #define vst1_p16_x4(a, b) simde_vst1_p16_x4((a), (b)) #endif @@ -320,7 +336,9 @@ simde_vst1_p64_x4(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x1x4_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !((!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) \ + && !defined(SIMDE_BUG_GCC_114521))) #undef vst1_p64_x4 #define vst1_p64_x4(a, b) simde_vst1_p64_x4((a), (b)) #endif @@ -338,7 +356,8 @@ simde_vst1_bf16_x4(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_bfloat16x simde_memcpy(ptr, &val_, sizeof(val_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst1_bf16_x4 #define vst1_bf16_x4(a, b) simde_vst1_bf16_x4((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/st1q_x2.h b/thirdparty/simde/arm/neon/st1q_x2.h index 4e96191af..aca91ee90 100644 --- a/thirdparty/simde/arm/neon/st1q_x2.h +++ b/thirdparty/simde/arm/neon/st1q_x2.h @@ -29,6 +29,7 @@ #define SIMDE_ARM_NEON_ST1Q_X2_H #include "types.h" +#include "st1.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -56,7 +57,8 @@ simde_vst1q_f16_x2(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_float16x8x #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1q_f16_x2 #define vst1q_f16_x2(a, b) simde_vst1q_f16_x2((a), (b)) #endif @@ -71,7 +73,8 @@ simde_vst1q_f32_x2(simde_float32 ptr[HEDLEY_ARRAY_PARAM(8)], simde_float32x4x2_t simde_vst1q_f32(ptr+4, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_f32_x2 #define vst1q_f32_x2(ptr, val) simde_vst1q_f32_x2((ptr), (val)) #endif @@ -101,7 +104,8 @@ simde_vst1q_s8_x2(int8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_int8x16x2_t val) { simde_vst1q_s8(ptr+16, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s8_x2 #define vst1q_s8_x2(ptr, val) simde_vst1q_s8_x2((ptr), (val)) #endif @@ -116,7 +120,8 @@ simde_vst1q_s16_x2(int16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int16x8x2_t val) { simde_vst1q_s16(ptr+8, val.val[1]); #endif } -#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s16_x2 #define vst1q_s16_x2(ptr, val) simde_vst1q_s16_x2((ptr), (val)) #endif @@ -131,7 +136,8 @@ simde_vst1q_s32_x2(int32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int32x4x2_t val) { simde_vst1q_s32(ptr+4, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s32_x2 #define vst1q_s32_x2(ptr, val) simde_vst1q_s32_x2((ptr), (val)) #endif @@ -146,7 +152,8 @@ simde_vst1q_s64_x2(int64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int64x2x2_t val) { simde_vst1q_s64(ptr+2, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s64_x2 #define vst1q_s64_x2(ptr, val) simde_vst1q_s64_x2((ptr), (val)) #endif @@ -161,7 +168,8 @@ simde_vst1q_u8_x2(uint8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint8x16x2_t val) { simde_vst1q_u8(ptr+16, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u8_x2 #define vst1q_u8_x2(ptr, val) simde_vst1q_u8_x2((ptr), (val)) #endif @@ -176,7 +184,8 @@ simde_vst1q_u16_x2(uint16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint16x8x2_t val) simde_vst1q_u16(ptr+8, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u16_x2 #define vst1q_u16_x2(ptr, val) simde_vst1q_u16_x2((ptr), 
(val)) #endif @@ -191,7 +200,8 @@ simde_vst1q_u32_x2(uint32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint32x4x2_t val) simde_vst1q_u32(ptr+4, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u32_x2 #define vst1q_u32_x2(ptr, val) simde_vst1q_u32_x2((ptr), (val)) #endif @@ -206,7 +216,8 @@ simde_vst1q_u64_x2(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x2x2_t val) simde_vst1q_u64(ptr+2, val.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u64_x2 #define vst1q_u64_x2(ptr, val) simde_vst1q_u64_x2((ptr), (val)) #endif @@ -230,7 +241,8 @@ simde_vst1q_p8_x2(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly8x16x2_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1q_p8_x2 #define vst1q_p8_x2(a, b) simde_vst1q_p8_x2((a), (b)) #endif @@ -254,7 +266,8 @@ simde_vst1q_p16_x2(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly16x8x2_ #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1q_p16_x2 #define vst1q_p16_x2(a, b) simde_vst1q_p16_x2((a), (b)) #endif @@ -278,7 +291,8 @@ simde_vst1q_p64_x2(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x2x2_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1q_p64_x2 #define vst1q_p64_x2(a, b) simde_vst1q_p64_x2((a), (b)) #endif @@ -296,7 +310,8 @@ simde_vst1q_bf16_x2(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_bfloat16 simde_memcpy(ptr, &val_, sizeof(val_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst1q_bf16_x2 #define vst1q_bf16_x2(a, b) simde_vst1q_bf16_x2((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/st1q_x3.h b/thirdparty/simde/arm/neon/st1q_x3.h index 04beeb2c8..73dcb9c8e 100644 --- a/thirdparty/simde/arm/neon/st1q_x3.h +++ b/thirdparty/simde/arm/neon/st1q_x3.h @@ -29,6 +29,7 @@ #define SIMDE_ARM_NEON_ST1Q_X3_H #include "types.h" +#include "st1.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -58,7 +59,8 @@ simde_vst1q_f16_x3(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_float16x8x #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1q_f16_x3 #define vst1q_f16_x3(a, b) simde_vst1q_f16_x3((a), (b)) #endif @@ -74,7 +76,8 @@ simde_vst1q_f32_x3(simde_float32 ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_ simde_vst1q_f32(ptr+8, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_f32_x3 #define vst1q_f32_x3(ptr, val) simde_vst1q_f32_x3((ptr), (val)) #endif @@ -106,7 +109,8 @@ simde_vst1q_s8_x3(int8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_int8x16x3_t val) { 
simde_vst1q_s8(ptr+32, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s8_x3 #define vst1q_s8_x3(ptr, val) simde_vst1q_s8_x3((ptr), (val)) #endif @@ -122,7 +126,8 @@ simde_vst1q_s16_x3(int16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int16x8x3_t val) { simde_vst1q_s16(ptr+16, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s16_x3 #define vst1q_s16_x3(ptr, val) simde_vst1q_s16_x3((ptr), (val)) #endif @@ -138,7 +143,8 @@ simde_vst1q_s32_x3(int32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int32x4x3_t val) { simde_vst1q_s32(ptr+8, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s32_x3 #define vst1q_s32_x3(ptr, val) simde_vst1q_s32_x3((ptr), (val)) #endif @@ -154,7 +160,8 @@ simde_vst1q_s64_x3(int64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int64x2x3_t val) { simde_vst1q_s64(ptr+4, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s64_x3 #define vst1q_s64_x3(ptr, val) simde_vst1q_s64_x3((ptr), (val)) #endif @@ -170,7 +177,8 @@ simde_vst1q_u8_x3(uint8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_uint8x16x3_t val) { simde_vst1q_u8(ptr+32, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u8_x3 
#define vst1q_u8_x3(ptr, val) simde_vst1q_u8_x3((ptr), (val)) #endif @@ -186,7 +194,8 @@ simde_vst1q_u16_x3(uint16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint16x8x3_t val) simde_vst1q_u16(ptr+16, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u16_x3 #define vst1q_u16_x3(ptr, val) simde_vst1q_u16_x3((ptr), (val)) #endif @@ -202,7 +211,8 @@ simde_vst1q_u32_x3(uint32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint32x4x3_t val) simde_vst1q_u32(ptr+8, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u32_x3 #define vst1q_u32_x3(ptr, val) simde_vst1q_u32_x3((ptr), (val)) #endif @@ -218,7 +228,8 @@ simde_vst1q_u64_x3(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) simde_vst1q_u64(ptr+4, val.val[2]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u64_x3 #define vst1q_u64_x3(ptr, val) simde_vst1q_u64_x3((ptr), (val)) #endif @@ -243,7 +254,8 @@ simde_vst1q_p8_x3(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_poly8x16x3_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1q_p8_x3 #define vst1q_p8_x3(a, b) simde_vst1q_p8_x3((a), (b)) #endif @@ -268,7 +280,8 @@ simde_vst1q_p16_x3(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly16x8x3_ #endif #endif } -#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1q_p16_x3 #define vst1q_p16_x3(a, b) simde_vst1q_p16_x3((a), (b)) #endif @@ -293,7 +306,8 @@ simde_vst1q_p64_x3(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_poly64x2x3_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1q_p64_x3 #define vst1q_p64_x3(a, b) simde_vst1q_p64_x3((a), (b)) #endif @@ -311,7 +325,8 @@ simde_vst1q_bf16_x3(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_bfloat16 simde_memcpy(ptr, &val_, sizeof(val_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst1q_bf16_x3 #define vst1q_bf16_x3(a, b) simde_vst1q_bf16_x3((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/st1q_x4.h b/thirdparty/simde/arm/neon/st1q_x4.h index 9b91c632e..a489e448e 100644 --- a/thirdparty/simde/arm/neon/st1q_x4.h +++ b/thirdparty/simde/arm/neon/st1q_x4.h @@ -31,6 +31,7 @@ #define SIMDE_ARM_NEON_ST1Q_X4_H #include "types.h" +#include "st1.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -60,7 +61,8 @@ simde_vst1q_f16_x4(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_float16x8x #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst1q_f16_x4 #define vst1q_f16_x4(a, b) simde_vst1q_f16_x4((a), (b)) #endif @@ 
-77,7 +79,8 @@ simde_vst1q_f32_x4(simde_float32 ptr[HEDLEY_ARRAY_PARAM(16)], simde_float32x4x4_ simde_vst1q_f32(ptr+12, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_f32_x4 #define vst1q_f32_x4(ptr, val) simde_vst1q_f32_x4((ptr), (val)) #endif @@ -111,7 +114,8 @@ simde_vst1q_s8_x4(int8_t ptr[HEDLEY_ARRAY_PARAM(64)], simde_int8x16x4_t val) { simde_vst1q_s8(ptr+48, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s8_x4 #define vst1q_s8_x4(ptr, val) simde_vst1q_s8_x4((ptr), (val)) #endif @@ -128,7 +132,8 @@ simde_vst1q_s16_x4(int16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_int16x8x4_t val) { simde_vst1q_s16(ptr+24, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s16_x4 #define vst1q_s16_x4(ptr, val) simde_vst1q_s16_x4((ptr), (val)) #endif @@ -145,7 +150,8 @@ simde_vst1q_s32_x4(int32_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int32x4x4_t val) { simde_vst1q_s32(ptr+12, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s32_x4 #define vst1q_s32_x4(ptr, val) simde_vst1q_s32_x4((ptr), (val)) #endif @@ -162,7 +168,8 @@ simde_vst1q_s64_x4(int64_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int64x2x4_t val) { simde_vst1q_s64(ptr+6, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || 
(defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_s64_x4 #define vst1q_s64_x4(ptr, val) simde_vst1q_s64_x4((ptr), (val)) #endif @@ -179,7 +186,8 @@ simde_vst1q_u8_x4(uint8_t ptr[HEDLEY_ARRAY_PARAM(64)], simde_uint8x16x4_t val) { simde_vst1q_u8(ptr+48, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u8_x4 #define vst1q_u8_x4(ptr, val) simde_vst1q_u8_x4((ptr), (val)) #endif @@ -196,7 +204,8 @@ simde_vst1q_u16_x4(uint16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint16x8x4_t val) simde_vst1q_u16(ptr+24, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u16_x4 #define vst1q_u16_x4(ptr, val) simde_vst1q_u16_x4((ptr), (val)) #endif @@ -213,7 +222,8 @@ simde_vst1q_u32_x4(uint32_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint32x4x4_t val) simde_vst1q_u32(ptr+12, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u32_x4 #define vst1q_u32_x4(ptr, val) simde_vst1q_u32_x4((ptr), (val)) #endif @@ -230,7 +240,8 @@ simde_vst1q_u64_x4(uint64_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint64x2x4_t val) simde_vst1q_u64(ptr+6, val.val[3]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + defined(SIMDE_BUG_GCC_REV_260989)) #undef vst1q_u64_x4 #define vst1q_u64_x4(ptr, val) simde_vst1q_u64_x4((ptr), (val)) #endif @@ -256,7 +267,8 @@ simde_vst1q_p8_x4(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(64)], 
simde_poly8x16x4_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1q_p8_x4 #define vst1q_p8_x4(a, b) simde_vst1q_p8_x4((a), (b)) #endif @@ -282,7 +294,8 @@ simde_vst1q_p16_x4(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly16x8x4_ #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1q_p16_x4 #define vst1q_p16_x4(a, b) simde_vst1q_p16_x4((a), (b)) #endif @@ -308,7 +321,8 @@ simde_vst1q_p64_x4(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly64x2x4_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !(!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))) #undef vst1q_p64_x4 #define vst1q_p64_x4(a, b) simde_vst1q_p64_x4((a), (b)) #endif @@ -326,7 +340,8 @@ simde_vst1q_bf16_x4(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_bfloat16 simde_memcpy(ptr, &val_, sizeof(val_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst1q_bf16_x4 #define vst1q_bf16_x4(a, b) simde_vst1q_bf16_x4((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/st2.h b/thirdparty/simde/arm/neon/st2.h index 75c6385f2..d5bdc7ccb 100644 --- a/thirdparty/simde/arm/neon/st2.h +++ b/thirdparty/simde/arm/neon/st2.h @@ -30,6 +30,7 @@ #if 
!defined(SIMDE_ARM_NEON_ST2_H) #define SIMDE_ARM_NEON_ST2_H +#include "st1.h" #include "combine.h" #include "zip.h" @@ -61,7 +62,8 @@ simde_vst2_f16(simde_float16_t *ptr, simde_float16x4x2_t val) { #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst2_f16 #define vst2_f16(a, b) simde_vst2_f16((a), (b)) #endif @@ -370,7 +372,8 @@ simde_vst2q_f16(simde_float16_t *ptr, simde_float16x8x2_t val) { simde_vst1q_f16(ptr+8, r.val[1]); #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst2q_f16 #define vst2q_f16(a, b) simde_vst2q_f16((a), (b)) #endif @@ -794,7 +797,8 @@ simde_vst2_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4x2_t val) { simde_memcpy(ptr, buf, sizeof(buf)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst2_bf16 #define vst2_bf16(a, b) simde_vst2_bf16((a), (b)) #endif @@ -814,7 +818,8 @@ simde_vst2q_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8x2_t val) { simde_memcpy(ptr, buf, sizeof(buf)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst2q_bf16 #define vst2q_bf16(a, b) simde_vst2q_bf16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/st2_lane.h b/thirdparty/simde/arm/neon/st2_lane.h index 3be5c8e79..eb43dfcc1 100644 --- a/thirdparty/simde/arm/neon/st2_lane.h +++ b/thirdparty/simde/arm/neon/st2_lane.h @@ -204,7 +204,8 @@ simde_vst2_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x4x } #endif } 
-#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst2_lane_f16 #define vst2_lane_f16(a, b, c) simde_vst2_lane_f16((a), (b), (c)) #endif @@ -414,7 +415,8 @@ simde_vst2q_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x8 } #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst2q_lane_f16 #define vst2q_lane_f16(a, b, c) simde_vst2q_lane_f16((a), (b), (c)) #endif @@ -586,7 +588,8 @@ simde_vst2_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16 } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst2_lane_bf16 #define vst2_lane_bf16(a, b, c) simde_vst2_lane_bf16((a), (b), (c)) #endif @@ -605,7 +608,8 @@ simde_vst2q_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat1 } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst2q_lane_bf16 #define vst2q_lane_bf16(a, b, c) simde_vst2q_lane_bf16((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/st3.h b/thirdparty/simde/arm/neon/st3.h index 29301ae6c..6095fff2c 100644 --- a/thirdparty/simde/arm/neon/st3.h +++ b/thirdparty/simde/arm/neon/st3.h @@ -63,7 +63,8 @@ simde_vst3_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float16x4x3_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef 
vst3_f16 #define vst3_f16(a, b) simde_vst3_f16((a), (b)) #endif @@ -469,7 +470,8 @@ simde_vst3q_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_float16x8x3_t #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst3q_f16 #define vst3q_f16(a, b) simde_vst3q_f16((a), (b)) #endif @@ -1144,7 +1146,8 @@ simde_vst3_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_bfloat16x4x3 simde_memcpy(ptr, buf, sizeof(buf)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst3_bf16 #define vst3_bf16(a, b) simde_vst3_bf16((a), (b)) #endif @@ -1165,7 +1168,8 @@ simde_vst3q_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_bfloat16x8x simde_memcpy(ptr, buf, sizeof(buf)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst3q_bf16 #define vst3q_bf16(a, b) simde_vst3q_bf16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/st3_lane.h b/thirdparty/simde/arm/neon/st3_lane.h index d8325b9a3..e07ce6948 100644 --- a/thirdparty/simde/arm/neon/st3_lane.h +++ b/thirdparty/simde/arm/neon/st3_lane.h @@ -204,7 +204,8 @@ simde_vst3_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x4x } #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst3_lane_f16 #define vst3_lane_f16(a, b, c) simde_vst3_lane_f16((a), (b), (c)) #endif @@ -414,7 +415,8 @@ simde_vst3q_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x8 } 
#endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst3q_lane_f16 #define vst3q_lane_f16(a, b, c) simde_vst3q_lane_f16((a), (b), (c)) #endif @@ -440,7 +442,7 @@ simde_vst3q_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x4 SIMDE_FUNCTION_ATTRIBUTES void -simde_vst3q_lane_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x2x3_t val, const int lane){ +simde_vst3q_lane_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x2x3_t val, const int lane) { //SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) SIMDE_CONSTIFY_2_NO_RESULT_(vst3q_lane_f64, HEDLEY_UNREACHABLE(), lane, ptr, val); @@ -586,7 +588,8 @@ simde_vst3_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16 } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst3_lane_bf16 #define vst3_lane_bf16(a, b, c) simde_vst3_lane_bf16((a), (b), (c)) #endif @@ -605,7 +608,8 @@ simde_vst3q_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat1 } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst3q_lane_bf16 #define vst3q_lane_bf16(a, b, c) simde_vst3q_lane_bf16((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/st4.h b/thirdparty/simde/arm/neon/st4.h index 35e3fe6f8..475f745a7 100644 --- a/thirdparty/simde/arm/neon/st4.h +++ b/thirdparty/simde/arm/neon/st4.h @@ -62,7 +62,8 @@ simde_vst4_f16(simde_float16_t *ptr, simde_float16x4x4_t val) { #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst4_f16 #define vst4_f16(a, b) simde_vst4_f16((a), (b)) #endif @@ -402,7 +403,8 @@ simde_vst4q_f16(simde_float16_t *ptr, simde_float16x8x4_t val) { #endif #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst4q_f16 #define vst4q_f16(a, b) simde_vst4q_f16((a), (b)) #endif @@ -887,7 +889,8 @@ simde_vst4_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4x4_t val) { simde_memcpy(ptr, buf, sizeof(buf)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst4_bf16 #define vst4_bf16(a, b) simde_vst4_bf16((a), (b)) #endif @@ -907,7 +910,8 @@ simde_vst4q_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8x4_t val) { simde_memcpy(ptr, buf, sizeof(buf)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst4q_bf16 #define vst4q_bf16(a, b) simde_vst4q_bf16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/st4_lane.h b/thirdparty/simde/arm/neon/st4_lane.h index 4f8a5b655..b85078379 100644 --- a/thirdparty/simde/arm/neon/st4_lane.h +++ b/thirdparty/simde/arm/neon/st4_lane.h @@ -204,7 +204,8 @@ simde_vst4_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x4x #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) #define simde_vst4_lane_f16(a, b, c) vst4_lane_f16((a), (b), (c)) #endif -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + 
!defined(SIMDE_ARM_NEON_FP16)) #undef vst4_lane_f16 #define vst4_lane_f16(a, b, c) simde_vst4_lane_f16((a), (b), (c)) #endif @@ -414,7 +415,8 @@ simde_vst4q_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x8 } #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vst4q_lane_f16 #define vst4q_lane_f16(a, b, c) simde_vst4q_lane_f16((a), (b), (c)) #endif @@ -587,7 +589,8 @@ simde_vst4_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16 } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst4_lane_bf16 #define vst4_lane_bf16(a, b, c) simde_vst4_lane_bf16((a), (b), (c)) #endif @@ -606,7 +609,8 @@ simde_vst4q_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat1 } #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_BF16)) #undef vst4q_lane_bf16 #define vst4q_lane_bf16(a, b, c) simde_vst4q_lane_bf16((a), (b), (c)) #endif diff --git a/thirdparty/simde/arm/neon/sub.h b/thirdparty/simde/arm/neon/sub.h index d54095044..2f7022c09 100644 --- a/thirdparty/simde/arm/neon/sub.h +++ b/thirdparty/simde/arm/neon/sub.h @@ -45,7 +45,8 @@ simde_vsubh_f16(simde_float16_t a, simde_float16_t b) { return simde_float16_from_float32(af - bf); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vsubh_f16 #define vsubh_f16(a, b) simde_vsubh_f16((a), (b)) #endif @@ -97,7 +98,8 @@ simde_vsub_f16(simde_float16x4_t a, simde_float16x4_t b) { 
return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vsub_f16 #define vsub_f16(a, b) simde_vsub_f16((a), (b)) #endif @@ -415,7 +417,8 @@ simde_vsubq_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vsubq_f16 #define vsubq_f16(a, b) simde_vsubq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/sudot_lane.h b/thirdparty/simde/arm/neon/sudot_lane.h index 6d3844bce..5137780ce 100644 --- a/thirdparty/simde/arm/neon/sudot_lane.h +++ b/thirdparty/simde/arm/neon/sudot_lane.h @@ -60,7 +60,8 @@ simde_vsudot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_uint8x8_t b, co #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) #define simde_vsudot_lane_s32(r, a, b, lane) vsudot_lane_s32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) #undef vsudot_lane_s32 #define vsudot_lane_s32(r, a, b, lane) simde_vsudot_lane_s32((r), (a), (b), (lane)) #endif @@ -92,7 +93,8 @@ simde_vsudot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_uint8x16_t b, #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) #define simde_vsudot_laneq_s32(r, a, b, lane) vsudot_laneq_s32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) #undef 
vsudot_laneq_s32 #define vsudot_laneq_s32(r, a, b, lane) simde_vsudot_laneq_s32((r), (a), (b), (lane)) #endif @@ -123,7 +125,8 @@ simde_vsudotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_uint8x16_t b #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) #define simde_vsudotq_laneq_s32(r, a, b, lane) vsudotq_laneq_s32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) #undef vsudotq_laneq_s32 #define vsudotq_laneq_s32(r, a, b, lane) simde_vsudotq_laneq_s32((r), (a), (b), (lane)) #endif @@ -154,7 +157,8 @@ simde_vsudotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_uint8x8_t b, #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) #define simde_vsudotq_lane_s32(r, a, b, lane) vsudotq_lane_s32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) #undef vsudotq_lane_s32 #define vsudotq_lane_s32(r, a, b, lane) simde_vsudotq_lane_s32((r), (a), (b), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/trn.h b/thirdparty/simde/arm/neon/trn.h index 8dfdf0efb..3d5149eaa 100644 --- a/thirdparty/simde/arm/neon/trn.h +++ b/thirdparty/simde/arm/neon/trn.h @@ -47,7 +47,8 @@ simde_vtrn_f16(simde_float16x4_t a, simde_float16x4_t b) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vtrn_f16 #define vtrn_f16(a, b) simde_vtrn_f16((a), (b)) #endif @@ -167,7 +168,8 @@ simde_vtrnq_f16(simde_float16x8_t a, simde_float16x8_t b) { return r; #endif } -#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vtrnq_f16 #define vtrnq_f16(a, b) simde_vtrnq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/trn1.h b/thirdparty/simde/arm/neon/trn1.h index 47d35ceaf..ba01c1ef5 100644 --- a/thirdparty/simde/arm/neon/trn1.h +++ b/thirdparty/simde/arm/neon/trn1.h @@ -57,7 +57,8 @@ simde_vtrn1_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vtrn1_f16 #define vtrn1_f16(a, b) simde_vtrn1_f16((a), (b)) #endif @@ -273,7 +274,8 @@ simde_vtrn1q_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vtrn1q_f16 #define vtrn1q_f16(a, b) simde_vtrn1q_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/trn2.h b/thirdparty/simde/arm/neon/trn2.h index e2d063040..ad6f1fba1 100644 --- a/thirdparty/simde/arm/neon/trn2.h +++ b/thirdparty/simde/arm/neon/trn2.h @@ -57,7 +57,8 @@ simde_vtrn2_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vtrn2_f16 #define vtrn2_f16(a, b) simde_vtrn2_f16((a), (b)) #endif @@ -273,7 +274,8 @@ simde_vtrn2q_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vtrn2q_f16 #define vtrn2q_f16(a, b) simde_vtrn2q_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/types.h b/thirdparty/simde/arm/neon/types.h index 5a5954ac9..191c19c1c 100644 --- a/thirdparty/simde/arm/neon/types.h +++ b/thirdparty/simde/arm/neon/types.h @@ -394,6 +394,10 @@ typedef union { __m128 m128; #endif + #if defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + __m128h m128h; + #endif + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int32x4_t neon; #endif @@ -626,11 +630,7 @@ typedef union { typedef poly64x2x3_t simde_poly64x2x3_t; typedef poly64x1x4_t simde_poly64x1x4_t; typedef poly64x2x4_t simde_poly64x2x4_t; - #if defined(SIMDE_ARCH_ARM_CRYPTO) - typedef poly128_t simde_poly128_t; - #else - #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT - #endif + typedef poly128_t simde_poly128_t; #else #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT #endif @@ -1287,7 +1287,13 @@ typedef union { #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) - typedef simde_float16_t float16_t; + typedef simde_float16_t float16_t; + typedef simde_float16x4x2_t float16x4x2_t; + typedef simde_float16x4x3_t float16x4x3_t; + typedef simde_float16x4x4_t float16x4x4_t; + typedef simde_float16x8x2_t float16x8x2_t; + typedef simde_float16x8x3_t float16x8x3_t; + typedef simde_float16x8x4_t float16x8x4_t; #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) typedef simde_float32_t float32_t; @@ -1460,6 +1466,10 @@ typedef union { SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint64x2_from_m128i, simde_uint64x2_t, __m128i) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float64x2_from_m128d, simde_float64x2_t, __m128d) #endif +#if defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + 
SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float16x8_to_m128h, __m128h, simde_float16x8_t) + SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float16x8_from_m128h, simde_float16x8_t, __m128h) +#endif #if defined(SIMDE_WASM_SIMD128_NATIVE) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_int8x16_to_v128, v128_t, simde_int8x16_t) diff --git a/thirdparty/simde/arm/neon/usdot.h b/thirdparty/simde/arm/neon/usdot.h index d32769479..40adc65c1 100644 --- a/thirdparty/simde/arm/neon/usdot.h +++ b/thirdparty/simde/arm/neon/usdot.h @@ -56,7 +56,8 @@ simde_vusdot_s32(simde_int32x2_t r, simde_uint8x8_t a, simde_int8x8_t b) { return simde_vadd_s32(r, simde_int32x2_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) #undef vusdot_s32 #define vusdot_s32(r, a, b) simde_vusdot_s32((r), (a), (b)) #endif @@ -82,7 +83,8 @@ simde_vusdotq_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x16_t b) { return simde_vaddq_s32(r, simde_int32x4_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) #undef vusdotq_s32 #define vusdotq_s32(r, a, b) simde_vusdotq_s32((r), (a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/usdot_lane.h b/thirdparty/simde/arm/neon/usdot_lane.h index 6d8de889d..512b685ce 100644 --- a/thirdparty/simde/arm/neon/usdot_lane.h +++ b/thirdparty/simde/arm/neon/usdot_lane.h @@ -57,10 +57,11 @@ simde_vusdot_lane_s32(simde_int32x2_t r, simde_uint8x8_t a, simde_int8x8_t b, co return result; } -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) #define simde_vusdot_lane_s32(r, a, b, lane) vusdot_lane_s32((r), (a), (b), (lane)) #endif -#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) #undef vusdot_lane_s32 #define vusdot_lane_s32(r, a, b, lane) simde_vusdot_lane_s32((r), (a), (b), (lane)) #endif @@ -89,10 +90,11 @@ simde_vusdot_laneq_s32(simde_int32x2_t r, simde_uint8x8_t a, simde_int8x16_t b, return result; } -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) #define simde_vusdot_laneq_s32(r, a, b, lane) vusdot_laneq_s32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) #undef vusdot_laneq_s32 #define vusdot_laneq_s32(r, a, b, lane) simde_vusdot_laneq_s32((r), (a), (b), (lane)) #endif @@ -120,10 +122,11 @@ simde_vusdotq_laneq_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x16_t b result = simde_int32x4_from_private(r_); return result; } -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) #define simde_vusdotq_laneq_s32(r, a, b, lane) vusdotq_laneq_s32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) #undef vusdotq_laneq_s32 #define vusdotq_laneq_s32(r, a, b, lane) simde_vusdotq_laneq_s32((r), (a), (b), (lane)) #endif @@ -151,10 +154,11 @@ simde_vusdotq_lane_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x8_t b, result = simde_int32x4_from_private(r_); return result; } -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_MATMUL_INT8) +#if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) #define simde_vusdotq_lane_s32(r, a, b, lane) vusdotq_lane_s32((r), (a), (b), (lane)) #endif -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARCH_ARM_MATMUL_INT8)) #undef vusdotq_lane_s32 #define vusdotq_lane_s32(r, a, b, lane) simde_vusdotq_lane_s32((r), (a), (b), (lane)) #endif diff --git a/thirdparty/simde/arm/neon/uzp.h b/thirdparty/simde/arm/neon/uzp.h index 2788a6f53..439dfe65c 100644 --- a/thirdparty/simde/arm/neon/uzp.h +++ b/thirdparty/simde/arm/neon/uzp.h @@ -47,7 +47,8 @@ simde_vuzp_f16(simde_float16x4_t a, simde_float16x4_t b) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vuzp_f16 #define vuzp_f16(a, b) simde_vuzp_f16((a), (b)) #endif @@ -167,7 +168,8 @@ simde_vuzpq_f16(simde_float16x8_t a, simde_float16x8_t b) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vuzpq_f16 #define vuzpq_f16(a, b) simde_vuzpq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/uzp1.h b/thirdparty/simde/arm/neon/uzp1.h index fbc41218e..0ef6b33cc 100644 --- a/thirdparty/simde/arm/neon/uzp1.h +++ b/thirdparty/simde/arm/neon/uzp1.h @@ -57,7 +57,8 @@ simde_vuzp1_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vuzp1_f16 #define vuzp1_f16(a, b) simde_vuzp1_f16((a), (b)) #endif @@ -325,7 
+326,8 @@ simde_vuzp1q_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vuzp1q_f16 #define vuzp1q_f16(a, b) simde_vuzp1q_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/uzp2.h b/thirdparty/simde/arm/neon/uzp2.h index b2b409104..7692a7d66 100644 --- a/thirdparty/simde/arm/neon/uzp2.h +++ b/thirdparty/simde/arm/neon/uzp2.h @@ -57,7 +57,8 @@ simde_vuzp2_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vuzp2_f16 #define vuzp2_f16(a, b) simde_vuzp2_f16((a), (b)) #endif @@ -325,7 +326,8 @@ simde_vuzp2q_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vuzp2q_f16 #define vuzp2q_f16(a, b) simde_vuzp2q_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/xar.h b/thirdparty/simde/arm/neon/xar.h index b7b2c5836..3c9bc2264 100644 --- a/thirdparty/simde/arm/neon/xar.h +++ b/thirdparty/simde/arm/neon/xar.h @@ -44,12 +44,12 @@ simde_vxarq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int d) SIMDE_VECTORIZE for (size_t i=0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = ((t.values[i] >> d) | (t.values[i] << (64 - d))); + r_.values[i] = ((t.values[i] >> d) | (t.values[i] << ((64 - d) & 63 ))); } return simde_uint64x2_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_SHA3) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) && !defined(SIMDE_BUG_GCC_123584) #define simde_vxarq_u64(a, b, d) vxarq_u64((a), (b), (d)) #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) diff --git a/thirdparty/simde/arm/neon/zip.h b/thirdparty/simde/arm/neon/zip.h index a7921cc62..d0a8d294c 100644 --- a/thirdparty/simde/arm/neon/zip.h +++ b/thirdparty/simde/arm/neon/zip.h @@ -47,7 +47,8 @@ simde_vzip_f16(simde_float16x4_t a, simde_float16x4_t b) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vzip_f16 #define vzip_f16(a, b) simde_vzip_f16((a), (b)) #endif @@ -167,7 +168,8 @@ simde_vzipq_f16(simde_float16x8_t a, simde_float16x8_t b) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vzipq_f16 #define vzipq_f16(a, b) simde_vzipq_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/zip1.h b/thirdparty/simde/arm/neon/zip1.h index ea7794359..7eaae8bc1 100644 --- a/thirdparty/simde/arm/neon/zip1.h +++ b/thirdparty/simde/arm/neon/zip1.h @@ -56,7 +56,8 @@ simde_vzip1_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vzip1_f16 #define vzip1_f16(a, b) simde_vzip1_f16((a), (b)) #endif @@ -327,7 +328,8 @@ simde_vzip1q_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vzip1q_f16 #define vzip1q_f16(a, b) simde_vzip1q_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/neon/zip2.h b/thirdparty/simde/arm/neon/zip2.h index 0cd2150ac..1c195b83f 100644 --- a/thirdparty/simde/arm/neon/zip2.h +++ b/thirdparty/simde/arm/neon/zip2.h @@ -56,7 +56,8 @@ simde_vzip2_f16(simde_float16x4_t a, simde_float16x4_t b) { return simde_float16x4_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vzip2_f16 #define vzip2_f16(a, b) simde_vzip2_f16((a), (b)) #endif @@ -306,7 +307,8 @@ simde_vzip2q_f16(simde_float16x8_t a, simde_float16x8_t b) { return simde_float16x8_from_private(r_); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && \ + !defined(SIMDE_ARM_NEON_FP16)) #undef vzip2q_f16 #define vzip2q_f16(a, b) simde_vzip2q_f16((a), (b)) #endif diff --git a/thirdparty/simde/arm/sve/types.h b/thirdparty/simde/arm/sve/types.h index f0579d96c..733efcb5e 100644 --- a/thirdparty/simde/arm/sve/types.h +++ b/thirdparty/simde/arm/sve/types.h @@ -309,7 +309,7 @@ SIMDE_BEGIN_DECLS_ __m128i m128i[(SIMDE_ARM_SVE_VECTOR_SIZE / 8) / sizeof(__m128i)]; #endif - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_NEON_FP16) float16x8_t neon; #endif diff --git a/thirdparty/simde/hedley.h b/thirdparty/simde/hedley.h index f064f3f4c..3f39bed34 100644 --- a/thirdparty/simde/hedley.h +++ b/thirdparty/simde/hedley.h @@ -185,7 +185,11 @@ #endif #if defined(__EMSCRIPTEN__) # include -# define 
HEDLEY_EMSCRIPTEN_VERSION HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) +# if defined( __EMSCRIPTEN_MAJOR__) +# define HEDLEY_EMSCRIPTEN_VERSION HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_MAJOR__, __EMSCRIPTEN_MINOR__, __EMSCRIPTEN_TINY__) +# else +# define HEDLEY_EMSCRIPTEN_VERSION HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) +# endif #endif #if defined(HEDLEY_EMSCRIPTEN_VERSION_CHECK) diff --git a/thirdparty/simde/mips/msa/adds.h b/thirdparty/simde/mips/msa/adds.h index e610d482a..2b7efc58f 100644 --- a/thirdparty/simde/mips/msa/adds.h +++ b/thirdparty/simde/mips/msa/adds.h @@ -356,8 +356,8 @@ simde_msa_adds_u_w(simde_v4u32 a, simde_v4u32 b) { r_; #if defined(SIMDE_X86_SSE4_1_NATIVE) - #if defined(__AVX512VL__) - __m128i notb = _mm_ternarylogic_epi32(b, b, b, 0x0f); + #if defined(SIMDE_ARCH_X86_AVX512VL) + __m128i notb = _mm_ternarylogic_epi32(b_.m128i, b_.m128i, b_.m128i, 0x0f); #else __m128i notb = _mm_xor_si128(b_.m128i, _mm_set1_epi32(~INT32_C(0))); #endif diff --git a/thirdparty/simde/mips/msa/st.h b/thirdparty/simde/mips/msa/st.h index 9565c84ee..c41c832cc 100644 --- a/thirdparty/simde/mips/msa/st.h +++ b/thirdparty/simde/mips/msa/st.h @@ -40,7 +40,7 @@ simde_msa_st_b(simde_v16i8 a, void * rs, const int s10) simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } #if defined(SIMDE_MIPS_MSA_NATIVE) - #define simde_msa_st_b(a, rs, s10) __msa_st_b((a), (rs), (s10)); + #define simde_msa_st_b(a, rs, s10) __msa_st_b((a), (rs), (s10)) #endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_b @@ -55,7 +55,7 @@ simde_msa_st_h(simde_v8i16 a, void * rs, const int s10) simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } #if defined(SIMDE_MIPS_MSA_NATIVE) - #define simde_msa_st_h(a, rs, s10) __msa_st_h((a), (rs), (s10)); + #define simde_msa_st_h(a, rs, s10) __msa_st_h((a), (rs), (s10)) #endif #if 
defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_h @@ -70,7 +70,7 @@ simde_msa_st_w(simde_v4i32 a, void * rs, const int s10) simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } #if defined(SIMDE_MIPS_MSA_NATIVE) - #define simde_msa_st_w(a, rs, s10) __msa_st_w((a), (rs), (s10)); + #define simde_msa_st_w(a, rs, s10) __msa_st_w((a), (rs), (s10)) #endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_w @@ -85,7 +85,7 @@ simde_msa_st_d(simde_v2i64 a, void * rs, const int s10) simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } #if defined(SIMDE_MIPS_MSA_NATIVE) - #define simde_msa_st_d(a, rs, s10) __msa_st_d((a), (rs), (s10)); + #define simde_msa_st_d(a, rs, s10) __msa_st_d((a), (rs), (s10)) #endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_d diff --git a/thirdparty/simde/simde-aes.h b/thirdparty/simde/simde-aes.h index ea3ef5aa4..ee1ad7b7e 100644 --- a/thirdparty/simde/simde-aes.h +++ b/thirdparty/simde/simde-aes.h @@ -35,6 +35,10 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS #if !(defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) && \ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) +#if HEDLEY_GCC_VERSION_CHECK(13,2,0) +_Pragma("GCC diagnostic ignored \"-Wunused-variable\"") +#endif + /* * Number of columns (32-bit words) comprising the State. For this * standard, Nb = 4. 
diff --git a/thirdparty/simde/simde-arch.h b/thirdparty/simde/simde-arch.h index 2f12e042a..74830e804 100644 --- a/thirdparty/simde/simde-arch.h +++ b/thirdparty/simde/simde-arch.h @@ -166,9 +166,15 @@ #if defined(__ARM_FEATURE_SVE) && __ARM_FEATURE_SVE # define SIMDE_ARCH_ARM_SVE #endif +#if defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2 +# define SIMDE_ARCH_ARM_SVE2 +#endif #if defined(__ARM_FEATURE_QRDMX) && __ARM_FEATURE_QRDMX # define SIMDE_ARCH_ARM_QRDMX #endif +#if defined(__ARM_FEATURE_SVE2_BITPERM) && __ARM_FEATURE_SVE2_BITPERM +# define SIMDE_ARCH_ARM_SVE2_BITPERM +#endif /* Blackfin */ @@ -387,6 +393,12 @@ # if defined(__AES__) # define SIMDE_ARCH_X86_AES 1 # endif +# if defined(__BMI__) +# define SIMDE_ARCH_X86_BMI1 1 +# endif +# if defined(__BMI2__) +# define SIMDE_ARCH_X86_BMI2 1 +# endif #endif /* Itanium @@ -541,7 +553,7 @@ #if defined(__riscv_zve64d) # define SIMDE_ARCH_RISCV_ZVE64D 1 #endif -#if defined(__riscv_v) +#if defined(__riscv_v) || (defined(__riscv_zve64d) && defined(__riscv_zvl128b)) # define SIMDE_ARCH_RISCV_V 1 #endif #if defined(__riscv_zvfh) diff --git a/thirdparty/simde/simde-common.h b/thirdparty/simde/simde-common.h index 212bff620..79cc97e4b 100644 --- a/thirdparty/simde/simde-common.h +++ b/thirdparty/simde/simde-common.h @@ -33,7 +33,7 @@ #define SIMDE_VERSION_MAJOR 0 #define SIMDE_VERSION_MINOR 8 -#define SIMDE_VERSION_MICRO 3 +#define SIMDE_VERSION_MICRO 4 #define SIMDE_VERSION HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, SIMDE_VERSION_MICRO) // Also update meson.build in the root directory of the repository @@ -603,6 +603,8 @@ typedef SIMDE_POLY64_TYPE simde_poly64; #if defined(SIMDE_POLY128_TYPE) # undef SIMDE_POLY128_TYPE #endif +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ // due to the __int128 below #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) # define SIMDE_POLY128_TYPE poly128_t # define SIMDE_POLY128_C(value) value @@ -613,7 +615,9 @@ typedef 
SIMDE_POLY64_TYPE simde_poly64; # define SIMDE_POLY128_TYPE uint64_t # define SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE 1 #endif + typedef SIMDE_POLY128_TYPE simde_poly128; +HEDLEY_DIAGNOSTIC_POP #if defined(__cplusplus) typedef bool simde_bool; @@ -676,17 +680,17 @@ typedef SIMDE_POLY128_TYPE simde_poly128; #endif /* Try to deal with environments without a standard library. */ -#if !defined(simde_memcpy) +#if !defined(simde_memcpy) && !defined(SIMDE_NO_STDLIB) #if HEDLEY_HAS_BUILTIN(__builtin_memcpy) #define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n) #endif #endif -#if !defined(simde_memset) +#if !defined(simde_memset) && !defined(SIMDE_NO_STDLIB) #if HEDLEY_HAS_BUILTIN(__builtin_memset) #define simde_memset(s, c, n) __builtin_memset(s, c, n) #endif #endif -#if !defined(simde_memcmp) +#if !defined(simde_memcmp) && !defined(SIMDE_NO_STDLIB) #if HEDLEY_HAS_BUILTIN(__builtin_memcmp) #define simde_memcmp(s1, s2, n) __builtin_memcmp(s1, s2, n) #endif @@ -703,7 +707,7 @@ typedef SIMDE_POLY128_TYPE simde_poly128; #endif #endif - #if !defined(SIMDE_NO_STRING_H) + #if !defined(SIMDE_NO_STRING_H) && !defined(SIMDE_NO_STDLIB) #include #if !defined(simde_memcpy) #define simde_memcpy(dest, src, n) memcpy(dest, src, n) @@ -726,7 +730,7 @@ typedef SIMDE_POLY128_TYPE simde_poly128; void simde_memcpy_(void* dest, const void* src, size_t len) { char* dest_ = HEDLEY_STATIC_CAST(char*, dest); - char* src_ = HEDLEY_STATIC_CAST(const char*, src); + const char* src_ = HEDLEY_STATIC_CAST(const char*, src); for (size_t i = 0 ; i < len ; i++) { dest_[i] = src_[i]; } @@ -741,19 +745,19 @@ typedef SIMDE_POLY128_TYPE simde_poly128; char* s_ = HEDLEY_STATIC_CAST(char*, s); char c_ = HEDLEY_STATIC_CAST(char, c); for (size_t i = 0 ; i < len ; i++) { - s_[i] = c_[i]; + s_[i] = c_; } } #define simde_memset(s, c, n) simde_memset_(s, c, n) #endif #if !defined(simde_memcmp) - SIMDE_FUCTION_ATTRIBUTES + SIMDE_FUNCTION_ATTRIBUTES int simde_memcmp_(const void *s1, const void *s2, size_t n) { - 
unsigned char* s1_ = HEDLEY_STATIC_CAST(unsigned char*, s1); - unsigned char* s2_ = HEDLEY_STATIC_CAST(unsigned char*, s2); - for (size_t i = 0 ; i < len ; i++) { + const unsigned char* s1_ = HEDLEY_STATIC_CAST(const unsigned char*, s1); + const unsigned char* s2_ = HEDLEY_STATIC_CAST(const unsigned char*, s2); + for (size_t i = 0 ; i < n ; i++) { if (s1_[i] != s2_[i]) { return (int) (s1_[i] - s2_[i]); } @@ -1017,18 +1021,25 @@ HEDLEY_DIAGNOSTIC_POP # if !HEDLEY_GCC_VERSION_CHECK(11,2,0) # define SIMDE_BUG_GCC_95483 # endif -# if defined(__OPTIMIZE__) +# if defined(__OPTIMIZE__) && !HEDLEY_GCC_VERSION_CHECK(15,0,0) # define SIMDE_BUG_GCC_100927 # endif # if !(HEDLEY_GCC_VERSION_CHECK(10,3,0)) # define SIMDE_BUG_GCC_98521 # endif -# endif -# if !HEDLEY_GCC_VERSION_CHECK(9,4,0) && defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_94488 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(9,1,0) && defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_REV_264019 +# elif defined(SIMDE_ARCH_AARCH64) +# if !HEDLEY_GCC_VERSION_CHECK(9,4,0) +# define SIMDE_BUG_GCC_94488 +# endif +# if !HEDLEY_GCC_VERSION_CHECK(9,1,0) +# define SIMDE_BUG_GCC_REV_264019 +# endif +# if HEDLEY_GCC_VERSION_CHECK(15,0,0) && !HEDLEY_GCC_VERSION_CHECK(16,1,0) +# define SIMDE_BUG_GCC_123584 +# endif +# if !HEDLEY_GCC_VERSION_CHECK(10,2,0) && !defined(__OPTIMIZE__) +# define SIMDE_BUG_GCC_96174 +# endif # endif # if (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && !defined(SIMDE_ARCH_AARCH64)) || (!defined(SIMDE_ARCH_AARCH64) && defined(SIMDE_ARCH_ARM)) # define SIMDE_BUG_GCC_REV_260989 @@ -1046,29 +1057,33 @@ HEDLEY_DIAGNOSTIC_POP # endif # if defined(SIMDE_ARCH_POWER) # define SIMDE_BUG_GCC_95227 -# define SIMDE_BUG_GCC_95782 +# if !HEDLEY_GCC_VERSION_CHECK(13,0,0) +# define SIMDE_BUG_GCC_95782 +# endif # if !HEDLEY_GCC_VERSION_CHECK(12,0,0) # define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS # endif -# endif -# if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) -# if !HEDLEY_GCC_VERSION_CHECK(10,2,0) && 
!defined(__OPTIMIZE__) -# define SIMDE_BUG_GCC_96174 -# endif -# endif -# if defined(SIMDE_ARCH_ZARCH) +# elif defined(SIMDE_ARCH_ZARCH) # define SIMDE_BUG_GCC_95782 # if HEDLEY_GCC_VERSION_CHECK(10,0,0) # define SIMDE_BUG_GCC_101614 # endif -# endif -# if defined(SIMDE_ARCH_MIPS_MSA) +# elif defined(SIMDE_ARCH_MIPS_MSA) # define SIMDE_BUG_GCC_97248 # if !HEDLEY_GCC_VERSION_CHECK(12,1,0) # define SIMDE_BUG_GCC_100760 # define SIMDE_BUG_GCC_100761 # define SIMDE_BUG_GCC_100762 # endif +# elif defined(SIMDE_ARCH_LOONGARCH) +# if HEDLEY_GCC_VERSION_CHECK(16,0,0) +# define SIMDE_BUG_GCC_123807 +# endif +# if defined(SIMDE_ARCH_LOONGARCH) && \ + ((HEDLEY_GCC_VERSION_CHECK(14,0,0) && !HEDLEY_GCC_VERSION_CHECK(14,4,0)) || \ + (HEDLEY_GCC_VERSION_CHECK(15,0,0) && !HEDLEY_GCC_VERSION_CHECK(15,2,0))) +# define SIMDE_BUG_GCC_121064 +# endif # endif # if !defined(__OPTIMIZE__) && !(\ HEDLEY_GCC_VERSION_CHECK(11,4,0) \ @@ -1076,12 +1091,17 @@ HEDLEY_DIAGNOSTIC_POP || (HEDLEY_GCC_VERSION_CHECK(9,5,0) && !(HEDLEY_GCC_VERSION_CHECK(10,0,0)))) # define SIMDE_BUG_GCC_105339 # endif +# if defined(SIMDE_ARCH_LOONGARCH) +# define SIMDE_BUG_GCC_123766 +# endif # elif defined(__clang__) # if defined(SIMDE_ARCH_AARCH64) # define SIMDE_BUG_CLANG_48257 // https://github.com/llvm/llvm-project/issues/47601 # define SIMDE_BUG_CLANG_71362 // https://github.com/llvm/llvm-project/issues/71362 # define SIMDE_BUG_CLANG_71365 // https://github.com/llvm/llvm-project/issues/71365 -# define SIMDE_BUG_CLANG_71751 // https://github.com/llvm/llvm-project/issues/71751 +# if !SIMDE_DETECT_CLANG_VERSION_CHECK(19,0,0) +# define SIMDE_BUG_CLANG_71751 +# endif # if !SIMDE_DETECT_CLANG_VERSION_CHECK(15,0,0) # define SIMDE_BUG_CLANG_45541 # endif @@ -1101,8 +1121,8 @@ HEDLEY_DIAGNOSTIC_POP # if !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) # define SIMDE_BUG_CLANG_BAD_VGET_SET_LANE_TYPES # endif -# if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NATIVE) -# define SIMDE_BUG_CLANG_71763 // 
https://github.com/llvm/llvm-project/issues/71763 +# if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !SIMDE_DETECT_CLANG_VERSION_CHECK(19,0,0) +# define SIMDE_BUG_CLANG_71763 # endif # endif # if defined(SIMDE_ARCH_POWER) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) @@ -1147,6 +1167,9 @@ HEDLEY_DIAGNOSTIC_POP # if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) # define SIMDE_BUG_CLANG_44589 # endif +# if SIMDE_DETECT_CLANG_VERSION_CHECK(21,1,1) && SIMDE_DETECT_CLANG_VERSION_NOT(22,1,0) +# define SIMDE_BUG_CLANG_179057 +# endif # define SIMDE_BUG_CLANG_48673 // https://github.com/llvm/llvm-project/issues/48017 # endif # define SIMDE_BUG_CLANG_45959 // https://github.com/llvm/llvm-project/issues/45304 @@ -1200,7 +1223,7 @@ HEDLEY_DIAGNOSTIC_POP #endif /* Initial support for RISCV V extensions based on ZVE64D. */ -#if defined(SIMDE_ARCH_RISCV_ZVE64D) && SIMDE_NATURAL_VECTOR_SIZE >= 64 +#if defined(SIMDE_ARCH_RISCV_ZVE64D) && SIMDE_NATURAL_VECTOR_SIZE >= 64 && defined(__riscv_v_fixed_vlen) #define RVV_FIXED_TYPE_DEF(name, lmul) \ typedef vint8##name##_t fixed_vint8##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ typedef vint16##name##_t fixed_vint16##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ diff --git a/thirdparty/simde/simde-detect-clang.h b/thirdparty/simde/simde-detect-clang.h index 7326f02db..df1a80352 100644 --- a/thirdparty/simde/simde-detect-clang.h +++ b/thirdparty/simde/simde-detect-clang.h @@ -60,7 +60,21 @@ */ #if defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION) -# if __has_warning("-Wmissing-designated-field-initializers") +# if __has_warning("-Wlifetime-safety") +# define SIMDE_DETECT_CLANG_VERSION 230000 +# elif __has_warning("-Wunsafe-buffer-usage-in-format-attr-call") +# define SIMDE_DETECT_CLANG_VERSION 220100 +# elif __has_builtin(__builtin_elementwise_fshl) +# define SIMDE_DETECT_CLANG_VERSION 
220000 +# elif __has_warning("-Wdefault-const-init-var") || __has_builtin(__builtin_structured_binding_size) +# if __clang_major__ == 21 && __clang_minor__ >= 1 && __clang_patchlevel__ >= 1 +# define SIMDE_DETECT_CLANG_VERSION 210101 // for SIMDE_BUG_CLANG_179057 +# else +# define SIMDE_DETECT_CLANG_VERSION 210000 +# endif +# elif __has_warning("-Warray-compare-cxx26") +# define SIMDE_DETECT_CLANG_VERSION 200000 +# elif __has_warning("-Wmissing-designated-field-initializers") # define SIMDE_DETECT_CLANG_VERSION 190000 # elif __has_warning("-Woverriding-option") # define SIMDE_DETECT_CLANG_VERSION 180000 @@ -121,8 +135,8 @@ * such as pragmas to disable a specific warning. */ #if defined(SIMDE_DETECT_CLANG_VERSION) -# define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (SIMDE_DETECT_CLANG_VERSION >= ((major * 10000) + (minor * 1000) + (revision))) -# define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (SIMDE_DETECT_CLANG_VERSION < ((major * 10000) + (minor * 1000) + (revision))) +# define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (SIMDE_DETECT_CLANG_VERSION >= ((major * 10000) + (minor * 100) + (revision))) +# define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (SIMDE_DETECT_CLANG_VERSION < ((major * 10000) + (minor * 100) + (revision))) #else # define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (0) # define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (0) diff --git a/thirdparty/simde/simde-diagnostic.h b/thirdparty/simde/simde-diagnostic.h index 6c7d2e732..7fe033ec3 100644 --- a/thirdparty/simde/simde-diagnostic.h +++ b/thirdparty/simde/simde-diagnostic.h @@ -84,6 +84,8 @@ * functions are inlined and don't generate ABI. 
*/ #if HEDLEY_GCC_VERSION_CHECK(7,0,0) #define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ _Pragma("GCC diagnostic ignored \"-Wpsabi\"") +#elif SIMDE_DETECT_CLANG_VERSION_CHECK(11, 0, 0) + #define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ _Pragma("clang diagnostic ignored \"-Wpsabi\"") #else #define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ #endif @@ -408,9 +410,9 @@ /* This is a false positive from GCC in a few places. */ #if HEDLEY_GCC_VERSION_CHECK(4,7,0) - #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") + #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") #else - #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ + #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ #endif #if defined(SIMDE_ENABLE_NATIVE_ALIASES) diff --git a/thirdparty/simde/simde-f16.h b/thirdparty/simde/simde-f16.h index 2f7c4c513..01faf0726 100644 --- a/thirdparty/simde/simde-f16.h +++ b/thirdparty/simde/simde-f16.h @@ -68,7 +68,8 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_FLOAT16_API) #if defined(__ARM_FP16_FORMAT_IEEE) && (defined(SIMDE_ARM_NEON_FP16) || defined(__ARM_FP16_ARGS)) #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16 - #elif !defined(__EMSCRIPTEN__) && !(defined(__clang__) && defined(SIMDE_ARCH_POWER)) && \ + #elif !((defined(__EMSCRIPTEN__) || defined(__wasi__)) && !defined(__wasm_fp16__)) && \ + !(defined(__clang__) && defined(SIMDE_ARCH_POWER)) && \ !(defined(HEDLEY_MSVC_VERSION) && defined(__clang__)) && \ !(defined(SIMDE_ARCH_MIPS) && defined(__clang__)) && \ !(defined(SIMDE_ARCH_ZARCH) && defined(__clang__)) && \ @@ -98,7 +99,10 @@ SIMDE_BEGIN_DECLS_ #endif #if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16 + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ // due to the _Float16 below typedef _Float16 simde_float16; + HEDLEY_DIAGNOSTIC_POP #define SIMDE_FLOAT16_IS_SCALAR 1 #if !defined(__cplusplus) #define SIMDE_FLOAT16_C(value) value##f16 @@ -171,20 +175,29 @@ 
SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint16_as_float16, simde_float16, u #endif #endif +#define SIMDE_F16_ROUND_TO_NEAREST 0x00 +#define SIMDE_F16_ROUND_TO_NEG_INF 0x01 +#define SIMDE_F16_ROUND_TO_POS_INF 0x02 +#define SIMDE_F16_ROUND_TO_ZERO 0x03 + /* Conversion -- convert between single-precision and half-precision * floats. */ static HEDLEY_ALWAYS_INLINE HEDLEY_CONST simde_float16 -simde_float16_from_float32 (simde_float32 value) { +simde_x_float16_from_float32 (simde_float32 value, int round) { simde_float16 res; - #if \ - (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16) || \ + #if (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16) || \ (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) + if (round == SIMDE_F16_ROUND_TO_NEAREST) { res = HEDLEY_STATIC_CAST(simde_float16, value); + } else #elif (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI) + if (round == SIMDE_F16_ROUND_TO_NEAREST) { res.value = HEDLEY_STATIC_CAST(__fp16, value); - #else + } else + #endif + { /* This code is CC0, based heavily on code by Fabian Giesen. */ uint32_t f32u = simde_float32_as_uint32(value); static const uint32_t f32u_infty = UINT32_C(255) << 23; @@ -202,23 +215,59 @@ simde_float16_from_float32 (simde_float32 value) { * operands are below 0x80000000 (we clear the sign bit). */ if (f32u > f16u_max) { /* result is Inf or NaN (all exponent bits set) */ - f16u = (f32u > f32u_infty) ? UINT32_C(0x7e00) : UINT32_C(0x7c00); /* NaN->qNaN and Inf->Inf */ + f16u = (f32u > f32u_infty) ? UINT32_C(0x7e00) : /* NaN->qNaN */ + (f32u == f32u_infty) ? UINT32_C(0x7c00) : /* Inf->Inf */ + (round == SIMDE_F16_ROUND_TO_ZERO) || + (round == SIMDE_F16_ROUND_TO_NEG_INF && !sign) || + (round == SIMDE_F16_ROUND_TO_POS_INF && sign) ? + UINT32_C(0x7bff) : /* max f16 */ + UINT32_C(0x7c00); } else { /* (De)normalized number or zero */ if (f32u < (UINT32_C(113) << 23)) { /* resulting FP16 is subnormal or zero */ - /* use a magic value to align our 10 mantissa bits at the bottom of - * the float. 
as long as FP addition is round-to-nearest-even this - * just works. */ - f32u = simde_float32_as_uint32(simde_uint32_as_float32(f32u) + simde_uint32_as_float32(denorm_magic)); - - /* and one integer subtract of the bias later, we have our final float! */ - f16u = HEDLEY_STATIC_CAST(uint16_t, f32u - denorm_magic); + if (round == SIMDE_F16_ROUND_TO_NEAREST) { + /* use a magic value to align our 10 mantissa bits at the bottom of + * the float. as long as FP addition is round-to-nearest-even this + * just works. */ + f32u = simde_float32_as_uint32(simde_uint32_as_float32(f32u) + simde_uint32_as_float32(denorm_magic)); + + /* and one integer subtract of the bias later, we have our final float! */ + f16u = HEDLEY_STATIC_CAST(uint16_t, f32u - denorm_magic); + } else { + if (f32u == 0) { + f16u = 0; + } else if (f32u < (UINT32_C(103) << 23)) { /* resulting FP16 is min or zero */ + f16u = (round == SIMDE_F16_ROUND_TO_NEG_INF && sign) || + (round == SIMDE_F16_ROUND_TO_POS_INF && !sign) ? 1 : 0; + } else { /* exp is in 103..112 */ + int32_t shift = 14 + (112 - (f32u >> 23)); /* how many bits to drop */ + uint32_t mant = (f32u & 0x7fffff) | 0x800000; /* implicit one */ + uint32_t dropped = mant & ((UINT32_C(1) << shift) - 1); + f16u = HEDLEY_STATIC_CAST(uint16_t, mant >> shift); + f16u += (round == SIMDE_F16_ROUND_TO_NEG_INF && dropped && sign) || + (round == SIMDE_F16_ROUND_TO_POS_INF && dropped && !sign) ? 
1 : 0; + } + } } else { uint32_t mant_odd = (f32u >> 13) & 1; + uint32_t dropped = f32u & UINT32_C(0x1fff); /* update exponent, rounding bias part 1 */ - f32u += (HEDLEY_STATIC_CAST(uint32_t, 15 - 127) << 23) + UINT32_C(0xfff); + f32u += (HEDLEY_STATIC_CAST(uint32_t, 15 - 127) << 23); + /* rounding bias part 2 */ - f32u += mant_odd; + switch (round) { + case SIMDE_F16_ROUND_TO_NEAREST: + f32u += UINT32_C(0xfff) + mant_odd; + break; + case SIMDE_F16_ROUND_TO_NEG_INF: + if (dropped && sign) f32u += UINT32_C(0x2000); + break; + case SIMDE_F16_ROUND_TO_POS_INF: + if (dropped && !sign) f32u += UINT32_C(0x2000); + break; + case SIMDE_F16_ROUND_TO_ZERO: break; + } + /* take the bits! */ f16u = HEDLEY_STATIC_CAST(uint16_t, f32u >> 13); } @@ -226,11 +275,13 @@ simde_float16_from_float32 (simde_float32 value) { f16u |= sign >> 16; res = simde_uint16_as_float16(f16u); - #endif + } return res; } +#define simde_float16_from_float32(x) simde_x_float16_from_float32(x, SIMDE_F16_ROUND_TO_NEAREST) + static HEDLEY_ALWAYS_INLINE HEDLEY_CONST simde_float32 simde_float16_to_float32 (simde_float16 value) { diff --git a/thirdparty/simde/simde-features.h b/thirdparty/simde/simde-features.h index f8d0bb4b7..9100ee7fe 100644 --- a/thirdparty/simde/simde-features.h +++ b/thirdparty/simde/simde-features.h @@ -299,6 +299,18 @@ #endif #endif +#if !defined(SIMDE_X86_BMI1_NATIVE) && !defined(SIMDE_X86_BMI1_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_X86_BMI1) + #define SIMDE_X86_BMI1_NATIVE + #endif +#endif + +#if !defined(SIMDE_X86_BMI2_NATIVE) && !defined(SIMDE_X86_BMI2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_X86_BMI2) + #define SIMDE_X86_BMI2_NATIVE + #endif +#endif + #if defined(HEDLEY_MSVC_VERSION) #pragma warning(push) #pragma warning(disable:4799) @@ -367,7 +379,7 @@ #endif #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) #include - #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #if defined(SIMDE_ARCH_ARM_NEON_FP16) #include #endif #endif @@ 
-379,8 +391,15 @@ #endif #endif +#if !defined(SIMDE_ARM_SVE2_NATIVE) && !defined(SIMDE_ARM_SVE2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_ARM_SVE2) + #define SIMDE_ARM_SVE2_NATIVE + #include + #endif +#endif + #if !defined(SIMDE_RISCV_V_NATIVE) && !defined(SIMDE_RISCV_V_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_RISCV_V) + #if defined(SIMDE_ARCH_RISCV_V) && defined(__riscv_v_fixed_vlen) #define SIMDE_RISCV_V_NATIVE #endif #endif @@ -680,6 +699,12 @@ #if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE) #define SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES #endif + #if !defined(SIMDE_X86_BMI1_NATIVE) + #define SIMDE_X86_BMI1_ENABLE_NATIVE_ALIASES + #endif + #if !defined(SIMDE_X86_BMI2_NATIVE) + #define SIMDE_X86_BMI2_ENABLE_NATIVE_ALIASES + #endif #if !defined(SIMDE_X86_F16C_NATIVE) #define SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES #endif diff --git a/thirdparty/simde/simde-math.h b/thirdparty/simde/simde-math.h index 1dbf3bbae..0a5a4a3fb 100644 --- a/thirdparty/simde/simde-math.h +++ b/thirdparty/simde/simde-math.h @@ -103,10 +103,11 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS * macro libc++ uses. */ #if defined(isnan) || (defined(_LIBCPP_MATH_H) && !defined(_LIBCPP_CMATH)) #define SIMDE_MATH_HAVE_MATH_H - #elif defined(__cplusplus) + #elif defined(__cplusplus) && !defined(HEDLEY_MSVC_VERSION) #define SIMDE_MATH_HAVE_CMATH #endif -#elif defined(__has_include) +#endif +#if defined(__has_include) && !(defined(SIMDE_MATH_HAVE_MATH_H) || defined(SIMDE_MATH_HAVE_CMATH)) #if defined(__cplusplus) && (__cplusplus >= 201103L) && __has_include() #define SIMDE_MATH_HAVE_CMATH #include @@ -972,13 +973,18 @@ simde_math_fpclass(double v, const int imm8) { #endif #endif -#if HEDLEY_HAS_BUILTIN(__builtin_exp10) || HEDLEY_GCC_VERSION_CHECK(3,4,0) +/* __builtin_exp10 lowers to exp10() which is a GNU extension available + * only in glibc. Other libcs (musl, OpenBSD, FreeBSD, MinGW, etc.) + * lack the symbol and produce a link error. Fall back to pow(10, v). 
*/ +#if (HEDLEY_HAS_BUILTIN(__builtin_exp10) || HEDLEY_GCC_VERSION_CHECK(3,4,0)) && \ + defined(__GLIBC__) # define simde_math_exp10(v) __builtin_exp10(v) #else # define simde_math_exp10(v) simde_math_pow(10.0, (v)) #endif -#if HEDLEY_HAS_BUILTIN(__builtin_exp10f) || HEDLEY_GCC_VERSION_CHECK(3,4,0) +#if (HEDLEY_HAS_BUILTIN(__builtin_exp10f) || HEDLEY_GCC_VERSION_CHECK(3,4,0)) && \ + defined(__GLIBC__) # define simde_math_exp10f(v) __builtin_exp10f(v) #else # define simde_math_exp10f(v) simde_math_powf(10.0f, (v)) @@ -1265,9 +1271,16 @@ simde_math_fpclass(double v, const int imm8) { #endif #if !defined(simde_math_roundeven) + /* __builtin_roundeven lowers to a roundeven() libm call on targets + * without a native rounding instruction (x86 without SSE4.1, + * powerpc, sparc, i386, etc.). roundeven() is C23 and only + * available in glibc >= 2.25; other libcs (musl, OpenBSD, FreeBSD, + * MinGW, etc.) lack the symbol and produce a link error. Guard for + * all compilers: non-glibc platforms use the inline fallback below. */ #if \ - ((!defined(HEDLEY_EMSCRIPTEN_VERSION) || HEDLEY_EMSCRIPTEN_VERSION_CHECK(3, 1, 43)) && HEDLEY_HAS_BUILTIN(__builtin_roundeven)) || \ - HEDLEY_GCC_VERSION_CHECK(10,0,0) + ((!defined(HEDLEY_EMSCRIPTEN_VERSION) || HEDLEY_EMSCRIPTEN_VERSION_CHECK(3, 1, 43)) && \ + HEDLEY_HAS_BUILTIN(__builtin_roundeven) && \ + (defined(__GLIBC__) && ((__GLIBC__ > 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25)))) #define simde_math_roundeven(v) __builtin_roundeven(v) #elif defined(simde_math_round) && defined(simde_math_fabs) static HEDLEY_INLINE @@ -1285,9 +1298,13 @@ simde_math_fpclass(double v, const int imm8) { #endif #if !defined(simde_math_roundevenf) + /* Same rationale as simde_math_roundeven above; applies to the float + * variant. Both GCC and Clang emit a roundevenf() libm call on + * targets without a native instruction. 
*/ #if \ - ((!defined(HEDLEY_EMSCRIPTEN_VERSION) || HEDLEY_EMSCRIPTEN_VERSION_CHECK(3, 1, 43)) && HEDLEY_HAS_BUILTIN(__builtin_roundevenf)) || \ - HEDLEY_GCC_VERSION_CHECK(10,0,0) + ((!defined(HEDLEY_EMSCRIPTEN_VERSION) || HEDLEY_EMSCRIPTEN_VERSION_CHECK(3, 1, 43)) && \ + HEDLEY_HAS_BUILTIN(__builtin_roundevenf) && \ + (defined(__GLIBC__) && ((__GLIBC__ > 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25)))) #define simde_math_roundevenf(v) __builtin_roundevenf(v) #elif defined(simde_math_roundf) && defined(simde_math_fabsf) static HEDLEY_INLINE @@ -1915,7 +1932,7 @@ simde_math_adds_u32(uint32_t a, uint32_t b) { return vqadds_u32(a, b); #else uint32_t r = a + b; - r |= -(r < a); + r |= HEDLEY_STATIC_CAST(uint32_t, -(r < a)); return r; #endif } @@ -1927,7 +1944,7 @@ simde_math_adds_u64(uint64_t a, uint64_t b) { return vqaddd_u64(a, b); #else uint64_t r = a + b; - r |= -(r < a); + r |= HEDLEY_STATIC_CAST(uint64_t, -(r < a)); return r; #endif } @@ -2043,7 +2060,7 @@ simde_math_subs_u32(uint32_t a, uint32_t b) { return vqsubs_u32(a, b); #else uint32_t res = a - b; - res &= -(res <= a); + res &= HEDLEY_STATIC_CAST(uint32_t, -(res <= a)); return res; #endif } @@ -2055,7 +2072,7 @@ simde_math_subs_u64(uint64_t a, uint64_t b) { return vqsubd_u64(a, b); #else uint64_t res = a - b; - res &= -(res <= a); + res &= HEDLEY_STATIC_CAST(uint64_t, -(res <= a)); return res; #endif } diff --git a/thirdparty/simde/wasm/simd128.h b/thirdparty/simde/wasm/simd128.h index 8a9121b52..bfbf6d3d2 100644 --- a/thirdparty/simde/wasm/simd128.h +++ b/thirdparty/simde/wasm/simd128.h @@ -1050,7 +1050,7 @@ simde_wasm_u8x16_splat (uint8_t a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_wasm_i8x16_const_splat(a) wasm_i8x16_const_splat((a)) #else - #define simde_wasm_i8x16_const_splat(a) simde_wasm_i8x16_splat(a); + #define simde_wasm_i8x16_const_splat(a) simde_wasm_i8x16_splat(a) #endif #if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) #define wasm_i8x16_const_splat(a) 
simde_wasm_i8x16_const_splat((a)) @@ -1059,7 +1059,7 @@ simde_wasm_u8x16_splat (uint8_t a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_wasm_u8x16_const_splat(a) wasm_u8x16_const_splat((a)) #else - #define simde_wasm_u8x16_const_splat(a) simde_wasm_u8x16_splat(a); + #define simde_wasm_u8x16_const_splat(a) simde_wasm_u8x16_splat(a) #endif #if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) #define wasm_u8x16_const_splat(a) simde_wasm_u8x16_const_splat((a)) @@ -1124,7 +1124,7 @@ simde_wasm_u16x8_splat (uint16_t a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_wasm_i16x8_const_splat(a) wasm_i16x8_const_splat((a)) #else - #define simde_wasm_i16x8_const_splat(a) simde_wasm_i16x8_splat(a); + #define simde_wasm_i16x8_const_splat(a) simde_wasm_i16x8_splat(a) #endif #if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) #define wasm_i16x8_const_splat(a) simde_wasm_i16x8_const_splat((a)) @@ -1133,7 +1133,7 @@ simde_wasm_u16x8_splat (uint16_t a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_wasm_u16x8_const_splat(a) wasm_u16x8_const_splat((a)) #else - #define simde_wasm_u16x8_const_splat(a) simde_wasm_u16x8_splat(a); + #define simde_wasm_u16x8_const_splat(a) simde_wasm_u16x8_splat(a) #endif #if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) #define wasm_u16x8_const_splat(a) simde_wasm_u16x8_const_splat((a)) @@ -1198,7 +1198,7 @@ simde_wasm_u32x4_splat (uint32_t a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_wasm_i32x4_const_splat(a) wasm_i32x4_const_splat((a)) #else - #define simde_wasm_i32x4_const_splat(a) simde_wasm_i32x4_splat(a); + #define simde_wasm_i32x4_const_splat(a) simde_wasm_i32x4_splat(a) #endif #if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) #define wasm_i32x4_const_splat(a) simde_wasm_i32x4_const_splat((a)) @@ -1207,7 +1207,7 @@ simde_wasm_u32x4_splat (uint32_t a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_wasm_u32x4_const_splat(a) wasm_u32x4_const_splat((a)) #else - #define 
simde_wasm_u32x4_const_splat(a) simde_wasm_u32x4_splat(a); + #define simde_wasm_u32x4_const_splat(a) simde_wasm_u32x4_splat(a) #endif #if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) #define wasm_u32x4_const_splat(a) simde_wasm_u32x4_const_splat((a)) @@ -1272,7 +1272,7 @@ simde_wasm_u64x2_splat (uint64_t a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_wasm_i64x2_const_splat(a) wasm_i64x2_const_splat((a)) #else - #define simde_wasm_i64x2_const_splat(a) simde_wasm_i64x2_splat(a); + #define simde_wasm_i64x2_const_splat(a) simde_wasm_i64x2_splat(a) #endif #if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) #define wasm_i64x2_const_splat(a) simde_wasm_i64x2_const_splat((a)) @@ -1281,7 +1281,7 @@ simde_wasm_u64x2_splat (uint64_t a) { #if defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_wasm_u64x2_const_splat(a) wasm_u64x2_const_splat((a)) #else - #define simde_wasm_u64x2_const_splat(a) simde_wasm_u64x2_splat(a); + #define simde_wasm_u64x2_const_splat(a) simde_wasm_u64x2_splat(a) #endif #if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) #define wasm_i64x2_const_splat(a) simde_wasm_i64x2_const_splat((a)) @@ -3574,7 +3574,7 @@ simde_wasm_i16x8_abs (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < INT8_C(0)) ? -a_.i16[i] : a_.i16[i]; + r_.i16[i] = (a_.i16[i] < INT8_C(0)) ? 
HEDLEY_STATIC_CAST(int16_t, -a_.i16[i]) : a_.i16[i]; } #endif @@ -4244,7 +4244,7 @@ simde_wasm_i8x16_shr (simde_v128_t a, uint32_t count) { r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vshlq_s8(a_.neon_i8, vdupq_n_s8(-HEDLEY_STATIC_CAST(int8_t, count & 7))); + r_.neon_i8 = vshlq_s8(a_.neon_i8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, -HEDLEY_STATIC_CAST(int8_t, count & 7)))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i8 = vec_sra(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count & 7))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) @@ -4276,7 +4276,7 @@ simde_wasm_i16x8_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_sra_epi16(a_.sse_m128i, _mm_cvtsi32_si128(count & 15)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(-HEDLEY_STATIC_CAST(int16_t, count & 15))); + r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -HEDLEY_STATIC_CAST(int16_t, count & 15)))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i16 = vec_sra(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count & 15))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) @@ -4373,7 +4373,7 @@ simde_wasm_u8x16_shr (simde_v128_t a, uint32_t count) { r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vshlq_u8(a_.neon_u8, vdupq_n_s8(-HEDLEY_STATIC_CAST(int8_t, count & 7))); + r_.neon_u8 = vshlq_u8(a_.neon_u8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, -HEDLEY_STATIC_CAST(int8_t, count & 7)))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_u8 = vec_sr(a_.altivec_u8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count & 7))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) @@ -4405,7 +4405,7 @@ simde_wasm_u16x8_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_srl_epi16(a_.sse_m128i, _mm_cvtsi32_si128(count & 15)); #elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(-HEDLEY_STATIC_CAST(int16_t, count & 15))); + r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -HEDLEY_STATIC_CAST(int16_t, count & 15)))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_u16 = vec_sr(a_.altivec_u16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count & 15))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) @@ -6039,7 +6039,7 @@ simde_wasm_f64x2_pmin (simde_v128_t a, simde_v128_t b) { #if defined(SIMDE_X86_SSE2_NATIVE) r_.sse_m128d = _mm_min_pd(b_.sse_m128d, a_.sse_m128d); #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vminq_f64(a_.neon_f64, b_.neon_f64); + r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) diff --git a/thirdparty/simde/x86/aes.h b/thirdparty/simde/x86/aes.h index 1d5b04926..cda2883bd 100644 --- a/thirdparty/simde/x86/aes.h +++ b/thirdparty/simde/x86/aes.h @@ -1,4 +1,4 @@ -/* MIT License +/* SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -33,8 +33,18 @@ * Based on the document FIPS PUB 197 */ +#include + +#include "../hedley.h" +#include "../simde-diagnostic.h" +#include "../simde-features.h" +#include "../simde-common.h" #include "sse2.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + /* * Multiplication in GF(2^8) * http://en.wikipedia.org/wiki/Finite_field_arithmetic @@ -414,4 +424,8 @@ simde__m128i simde_mm_aesimc_si128(simde__m128i a) { #undef simde_x_aes_Nb +SIMDE_END_DECLS_ + +HEDLEY_DIAGNOSTIC_POP + #endif /* !defined(SIMDE_X86_AES_H) */ diff --git a/thirdparty/simde/x86/avx.h b/thirdparty/simde/x86/avx.h index 
e13b4cc20..fa0b140ac 100644 --- a/thirdparty/simde/x86/avx.h +++ b/thirdparty/simde/x86/avx.h @@ -25,12 +25,26 @@ * 2020 Michael R. Crusoe */ -#include "sse.h" #if !defined(SIMDE_X86_AVX_H) #define SIMDE_X86_AVX_H -#include "sse4.2.h" +#include +#include +#include + +#include "../hedley.h" +#include "../simde-align.h" +#include "../simde-arch.h" +#include "../simde-detect-clang.h" +#include "../simde-diagnostic.h" +#include "../simde-features.h" +#include "../simde-math.h" +#include "../simde-common.h" #include "../simde-f16.h" +#include "sse.h" +#include "sse2.h" +#include "sse4.1.h" +#include "sse4.2.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -392,7 +406,9 @@ simde_mm256_castps_pd (simde__m256 a) { #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) return (simde__m256d)a; #else - return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a); + simde__m256d r; + simde_memcpy(&r, &a, sizeof(r)); + return r; #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -408,7 +424,9 @@ simde_mm256_castps_si256 (simde__m256 a) { #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) return (simde__m256i)a; #else - return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a); + simde__m256i r; + simde_memcpy(&r, &a, sizeof(r)); + return r; #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -424,7 +442,9 @@ simde_mm256_castsi256_pd (simde__m256i a) { #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) return (simde__m256d)a; #else - return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a); + simde__m256d r; + simde_memcpy(&r, &a, sizeof(r)); + return r; #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -440,7 +460,9 @@ simde_mm256_castsi256_ps (simde__m256i a) { #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) return (simde__m256)a; #else - return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a); + simde__m256 r; + simde_memcpy(&r, &a, sizeof(r)); + return r; #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -456,7 +478,9 @@ simde_mm256_castpd_ps (simde__m256d a) { #elif 
defined(SIMDE_LOONGARCH_LASX_NATIVE) return (simde__m256)a; #else - return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a); + simde__m256 r; + simde_memcpy(&r, &a, sizeof(r)); + return r; #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -472,7 +496,9 @@ simde_mm256_castpd_si256 (simde__m256d a) { #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) return (simde__m256i)a; #else - return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a); + simde__m256i r; + simde_memcpy(&r, &a, sizeof(r)); + return r; #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -493,6 +519,8 @@ simde_mm256_setzero_si256 (void) { #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_setzero_si128(); r_.m128i[1] = simde_mm_setzero_si128(); + #elif defined(SIMDE_VECTOR_SUBSCRIPT) + r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0, 0, 0, 0, 0 }; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { @@ -1016,15 +1044,15 @@ simde_mm256_set_ps (simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_ simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0); + #elif defined(SIMDE_ARCH_LOONGARCH) + simde__m256 tmp_ = { e0, e1, e2, e3, e4, e5, e6, e7 }; + return tmp_; #else simde__m256_private r_; #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) r_.m128[0] = simde_mm_set_ps(e3, e2, e1, e0); r_.m128[1] = simde_mm_set_ps(e7, e6, e5, e4); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - SIMDE_ALIGN_LIKE_32(__m256) simde_float32 data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 }; - r_.i256 = __lasx_xvld(data, 0); #else r_.f32[0] = e0; r_.f32[1] = e1; @@ -1050,6 +1078,9 @@ simde__m256d simde_mm256_set_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_set_pd(e3, e2, e1, e0); + #elif defined(SIMDE_ARCH_LOONGARCH) + simde__m256d tmp_ = { e0, e1, e2, e3 }; + return tmp_; #else 
simde__m256d_private r_; @@ -2167,10 +2198,12 @@ simde_mm256_castps128_ps256 (simde__m128 a) { #else simde__m256_private r_; simde__m128_private a_ = simde__m128_to_private(a); - + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ r_.m128_private[0] = a_; return simde__m256_from_private(r_); + HEDLEY_DIAGNOSTIC_POP #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -2764,8 +2797,7 @@ simde_mm_cmp_sd (simde__m128d a, simde__m128d b, const int imm8) case SIMDE_CMP_GT_OQ: case SIMDE_CMP_GT_OS: #if defined(SIMDE_LOONGARCH_LASX_NATIVE) - t_ = __lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64); - a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vnor_v(t_, t_), 0x00); + a_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64), 0x00); #else a_.i64[0] = (a_.f64[0] > b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); #endif @@ -2934,8 +2966,7 @@ simde_mm_cmp_ss (simde__m128 a, simde__m128 b, const int imm8) case SIMDE_CMP_GT_OQ: case SIMDE_CMP_GT_OS: #if defined(SIMDE_LOONGARCH_LASX_NATIVE) - t_ = __lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32); - a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vnor_v(t_, t_), 0x00); + a_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32), 0x00); #else a_.i32[0] = (a_.f32[0] > b_.f32[0]) ? 
~INT32_C(0) : INT32_C(0); #endif @@ -3175,8 +3206,7 @@ simde_mm256_cmp_pd case SIMDE_CMP_GT_OQ: case SIMDE_CMP_GT_OS: #if defined(SIMDE_LOONGARCH_LASX_NATIVE) - t_ = __lasx_xvfcmp_cle_d(a_.d256, b_.d256); - r_.i256 = __lasx_xvnor_v(t_, t_); + r_.i256 = __lasx_xvfcmp_clt_d(b_.d256, a_.d256); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64)); #else @@ -3604,22 +3634,31 @@ simde__m128i simde_mm256_cvtpd_epi32 (simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvtpd_epi32(a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) simde__m256d_private a_; a_.i256 = __lasx_xvftintrne_w_d(a, a); a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); return a_.m128d_private[0].lsx_i64; #else simde__m128i_private r_; - simde__m256d_private a_ = simde__m256d_to_private(a); + simde__m256d_private a_; - #if defined(simde_math_nearbyint) + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + a_ = simde__m256d_to_private(a); + r_.m64[0] = simde_mm_cvtpd_pi32(a_.m128d[0]); + r_.m64[1] = simde_mm_cvtpd_pi32(a_.m128d[1]); + #else + a_ = simde__m256d_to_private(simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_NEAREST_INT)); SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[i])); + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + simde_float64 v = simde_math_round(a_.f64[i]); + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); + #else + r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? 
+ SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; + #endif } - #else - HEDLEY_UNREACHABLE(); #endif return simde__m128i_from_private(r_); @@ -3662,19 +3701,28 @@ simde__m256i simde_mm256_cvtps_epi32 (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvtps_epi32(a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) return __lasx_xvftintrne_w_s(a); #else simde__m256i_private r_; - simde__m256_private a_ = simde__m256_to_private(a); + simde__m256_private a_; - #if defined(simde_math_nearbyintf) + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + a_ = simde__m256_to_private(a); + r_.m128i[0] = simde_mm_cvtps_epi32(a_.m128[0]); + r_.m128i[1] = simde_mm_cvtps_epi32(a_.m128[1]); + #else + a_ = simde__m256_to_private(simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT)); SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyintf(a_.f32[i])); + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + simde_float32 v = simde_math_roundf(a_.f32[i]); + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); + #else + r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? 
+ SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; + #endif } - #else - HEDLEY_UNREACHABLE(); #endif return simde__m256i_from_private(r_); @@ -3775,7 +3823,7 @@ simde__m128i simde_mm256_cvttpd_epi32 (simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvttpd_epi32(a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) simde__m256i_private a_; a_.i256 = __lasx_xvftintrz_w_d(a, a); a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); @@ -3784,13 +3832,20 @@ simde_mm256_cvttpd_epi32 (simde__m256d a) { simde__m128i_private r_; simde__m256d_private a_ = simde__m256d_to_private(a); - #if defined(simde_math_trunc) + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + r_.m64[0] = simde_mm_cvttpd_pi32(a_.m128d[0]); + r_.m64[1] = simde_mm_cvttpd_pi32(a_.m128d[1]); + #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_trunc(a_.f64[i])); + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + simde_float64 v = simde_math_trunc(a_.f64[i]); + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); + #else + r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? 
+ SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; + #endif } - #else - HEDLEY_UNREACHABLE(); #endif return simde__m128i_from_private(r_); @@ -3806,19 +3861,26 @@ simde__m256i simde_mm256_cvttps_epi32 (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvttps_epi32(a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) return __lasx_xvftintrz_w_s(a); #else simde__m256i_private r_; simde__m256_private a_ = simde__m256_to_private(a); - #if defined(simde_math_truncf) + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_mm_cvttps_epi32(a_.m128[0]); + r_.m128i[1] = simde_mm_cvttps_epi32(a_.m128[1]); + #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_truncf(a_.f32[i])); + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + simde_float32 v = simde_math_truncf(a_.f32[i]); + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); + #else + r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? 
+ SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; + #endif } - #else - HEDLEY_UNREACHABLE(); #endif return simde__m256i_from_private(r_); @@ -4284,6 +4346,14 @@ simde_mm256_loadu_epi8(void const * mem_addr) { #define _mm256_loadu_epi8(a) simde_mm256_loadu_epi8(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_loadu_epu8(void const * mem_addr) { + simde__m256i r; + simde_memcpy(&r, mem_addr, sizeof(r)); + return r; +} + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) @@ -4309,6 +4379,14 @@ simde_mm256_loadu_epi16(void const * mem_addr) { #define _mm256_loadu_epi16(a) simde_mm256_loadu_epi16(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_loadu_epu16(void const * mem_addr) { + simde__m256i r; + simde_memcpy(&r, mem_addr, sizeof(r)); + return r; +} + #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ && !defined(SIMDE_BUG_CLANG_REV_344862) \ && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) @@ -4334,6 +4412,14 @@ simde_mm256_loadu_epi32(void const * mem_addr) { #define _mm256_loadu_epi32(a) simde_mm256_loadu_epi32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_loadu_epu32(void const * mem_addr) { + simde__m256i r; + simde_memcpy(&r, mem_addr, sizeof(r)); + return r; +} + #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ && !defined(SIMDE_BUG_CLANG_REV_344862) \ && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) @@ -4359,6 +4445,14 @@ simde_mm256_loadu_epi64(void const * mem_addr) { #define _mm256_loadu_epi64(a) simde_mm256_loadu_epi64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_loadu_epu64(void const * mem_addr) { + simde__m256i r; + simde_memcpy(&r, mem_addr, sizeof(r)); + return r; +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256i 
simde_mm256_loadu_si256 (void const * mem_addr) { @@ -4718,6 +4812,8 @@ simde_mm256_min_ps (simde__m256 a, simde__m256 b) { #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) return __lasx_xvfmin_s(a, b); #else + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ simde__m256_private r_, a_ = simde__m256_to_private(a), @@ -4734,6 +4830,7 @@ simde_mm256_min_ps (simde__m256 a, simde__m256 b) { #endif return simde__m256_from_private(r_); + HEDLEY_DIAGNOSTIC_POP #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -5166,7 +5263,9 @@ simde_mm256_permute_pd (simde__m256d a, const int imm8) SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_permute_ps (simde__m128 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIALIZED_ simde__m128_private r_, a_ = simde__m128_to_private(a); @@ -5177,6 +5276,7 @@ simde_mm_permute_ps (simde__m128 a, const int imm8) } return simde__m128_from_private(r_); + HEDLEY_DIAGNOSTIC_POP } #if defined(SIMDE_X86_AVX_NATIVE) # define simde_mm_permute_ps(a, imm8) _mm_permute_ps(a, imm8) @@ -5399,7 +5499,7 @@ simde_mm256_permute2f128_si256 (simde__m256i a, simde__m256i b, const int imm8) return simde__m256i_from_private(r_); } #if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute2f128_si128(a, b, imm8) _mm256_permute2f128_si128(a, b, imm8) +# define simde_mm256_permute2f128_si256(a, b, imm8) _mm256_permute2f128_si256(a, b, imm8) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm256_permute2f128_si256 @@ -5790,7 +5890,7 @@ void simde_mm256_store_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_store_ps(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, 
sizeof(a)); @@ -5806,7 +5906,7 @@ void simde_mm256_store_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_store_pd(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a)); @@ -5822,7 +5922,7 @@ void simde_mm256_store_si256 (simde__m256i* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_store_si256(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); @@ -5838,7 +5938,7 @@ void simde_mm256_storeu_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_storeu_ps(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); @@ -5854,7 +5954,7 @@ void simde_mm256_storeu_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_storeu_pd(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) simde__m256d_private a_ = simde__m256d_to_private(a); @@ -5875,7 +5975,7 @@ void simde_mm256_storeu_si256 (void* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_storeu_si256(SIMDE_ALIGN_CAST(__m256i*, mem_addr), a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); @@ -5936,7 
+6036,7 @@ void simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_ps(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); @@ -5954,7 +6054,7 @@ void simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_pd(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); @@ -5972,7 +6072,7 @@ void simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_si256(mem_addr, a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) return __lasx_xvst(a, mem_addr, 0); #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); @@ -6603,7 +6703,6 @@ simde_mm256_testc_si256 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_testc_si256(a, b); #else - int_fast32_t r = 0; simde__m256i_private a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); @@ -6612,14 +6711,16 @@ simde_mm256_testc_si256 (simde__m256i a, simde__m256i b) { a_.i256 = __lasx_xvandn_v(a_.i256, b_.i256); a_.i256 = __lasx_xvmsknz_b(a_.i256); return (__lasx_xvpickve2gr_w(a_.i256, 0) + __lasx_xvpickve2gr_w(a_.i256, 4)) ? 
0 : 1; + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + return simde_mm_testc_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testc_si128(a_.m128i[1], b_.m128i[1]); #else + int_fast32_t r = 0; SIMDE_VECTORIZE_REDUCTION(|:r) for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { r |= ~a_.i32f[i] & b_.i32f[i]; } + return HEDLEY_STATIC_CAST(int, !r); #endif - - return HEDLEY_STATIC_CAST(int, !r); #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -6761,7 +6862,6 @@ simde_mm256_testz_si256 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_testz_si256(a, b); #else - int_fast32_t r = 0; simde__m256i_private a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); @@ -6771,17 +6871,15 @@ simde_mm256_testz_si256 (simde__m256i a, simde__m256i b) { a_.i256 = __lasx_xvmsknz_b(a_.i256); return (__lasx_xvpickve2gr_w(a_.i256, 0) + __lasx_xvpickve2gr_w(a_.i256, 4)) ? 0 : 1; #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r = simde_mm_testz_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testz_si128(a_.m128i[1], b_.m128i[1]); + return simde_mm_testz_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testz_si128(a_.m128i[1], b_.m128i[1]); #else + int_fast32_t r = 0; SIMDE_VECTORIZE_REDUCTION(|:r) for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { r |= a_.i32f[i] & b_.i32f[i]; } - - r = !r; + return HEDLEY_STATIC_CAST(int, !r); #endif - - return HEDLEY_STATIC_CAST(int, r); #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -6943,25 +7041,27 @@ simde_mm256_testnzc_si256 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_testnzc_si256(a, b); #else - int32_t rc = 0, rz = 0; simde__m256i_private a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + int_fast32_t rc = 0, rz = 0; __m256i m = __lasx_xvandn_v(a_.i256, b_.i256); __m256i n = __lasx_xvand_v(a_.i256, b_.i256); m = __lasx_xvmsknz_b(m); n = __lasx_xvmsknz_b(n); rc = 
__lasx_xvpickve2gr_w(m, 0) + __lasx_xvpickve2gr_w(m, 4); rz = __lasx_xvpickve2gr_w(n, 0) + __lasx_xvpickve2gr_w(n, 4); return (rc != 0) && (rz != 0); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + return simde_mm_testnzc_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testnzc_si128(a_.m128i[1], b_.m128i[1]); #else + int_fast32_t rc = 0, rz = 0; for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { rc |= ~a_.i32f[i] & b_.i32f[i]; rz |= a_.i32f[i] & b_.i32f[i]; } - - return !!(rc & rz); + return HEDLEY_STATIC_CAST(int, rc && rz); #endif #endif } diff --git a/thirdparty/simde/x86/avx2.h b/thirdparty/simde/x86/avx2.h index ad939e90a..3bb2f4e17 100644 --- a/thirdparty/simde/x86/avx2.h +++ b/thirdparty/simde/x86/avx2.h @@ -31,6 +31,7 @@ #define SIMDE_X86_AVX2_H #include "avx.h" +#include "sse4.1.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -54,7 +55,7 @@ simde_mm256_abs_epi8 (simde__m256i a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i]; + r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? HEDLEY_STATIC_CAST(int8_t, -a_.i8[i]) : a_.i8[i]; } #endif @@ -84,7 +85,7 @@ simde_mm256_abs_epi16 (simde__m256i a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i]; + r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? 
HEDLEY_STATIC_CAST(int16_t, -a_.i16[i]) : a_.i16[i]; } #endif @@ -627,8 +628,12 @@ simde_mm256_blend_epi16(simde__m256i a, simde__m256i b, const int imm8) return simde__m256i_from_private(r_); } -#if defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_BUG_CLANG_REV_234560) -# define simde_mm256_blend_epi16(a, b, imm8) _mm256_castpd_si256(_mm256_blend_epi16(a, b, imm8)) +#if defined(SIMDE_X86_AVX2_NATIVE) +# if defined(SIMDE_BUG_CLANG_REV_234560) +# define simde_mm256_blend_epi16(a, b, imm8) _mm256_castpd_si256(_mm256_blend_epi16(a, b, imm8)) +# else +# define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8) +# endif #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) # undef sr # define sr(i, j) -((i >> j) & 1) @@ -1119,7 +1124,7 @@ simde_mm256_bsrli_epi128 (simde__m256i a, const int imm8) SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0) #define simde_mm256_bsrli_epi128(a, imm8) _mm256_bsrli_epi128(a, imm8) #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - #define simde_mm256_bslli_epi128(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_d(0) : __lasx_xvbsll_v(a, imm8)) + #define simde_mm256_bsrli_epi128(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_d(0) : __lasx_xvbsrl_v(a, imm8)) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm256_bsrli_epi128 @@ -1157,6 +1162,68 @@ simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) { #define _mm256_cmpeq_epi8(a, b) simde_mm256_cmpeq_epi8(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpneq_epi8 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpneq_epi8(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpneq_epi8(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (a_.i8[i] != b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpeq_epu8 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpeq_epu8(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpeq_epu8(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { + r_.u8[i] = (a_.u8[i] == b_.u8[i]) ? UINT8_MAX : UINT8_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpneq_epu8 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpneq_epu8(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpneq_epu8(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { + r_.u8[i] = (a_.u8[i] != b_.u8[i]) ? 
UINT8_MAX : UINT8_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) { @@ -1188,6 +1255,69 @@ simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) { #define _mm256_cmpeq_epi16(a, b) simde_mm256_cmpeq_epi16(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpneq_epi16 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpneq_epi16(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpneq_epi16(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = (a_.i16[i] != b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpeq_epu16 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpeq_epu16(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpeq_epu16(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] == b_.u16[i]) ? 
UINT16_MAX : UINT16_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpneq_epu16 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpneq_epu16(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpneq_epu16(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] != b_.u16[i]) ? UINT16_MAX : UINT16_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) { @@ -1219,6 +1349,70 @@ simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) { #define _mm256_cmpeq_epi32(a, b) simde_mm256_cmpeq_epi32(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpneq_epi32 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpneq_epi32(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpneq_epi32(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] != b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpeq_epu32 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpeq_epu32(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpeq_epu32(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] == b_.u32[i]) ? UINT32_MAX : UINT32_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpneq_epu32 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpneq_epu32(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpneq_epu32(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] != b_.u32[i]) ? 
UINT32_MAX : UINT32_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) { @@ -1250,6 +1444,69 @@ simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) { #define _mm256_cmpeq_epi64(a, b) simde_mm256_cmpeq_epi64(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpneq_epi64 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpneq_epi64(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpneq_epi64(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = (a_.i64[i] != b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpeq_epu64 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpeq_epu64(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpeq_epu64(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? 
UINT64_MAX : UINT64_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_cmpneq_epu64 (simde__m256i a, simde__m256i b) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_x_mm_cmpneq_epu64(a_.m128i[0], b_.m128i[0]); + r_.m128i[1] = simde_x_mm_cmpneq_epu64(a_.m128i[1], b_.m128i[1]); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] != b_.u64[i]) ? UINT64_MAX : UINT64_C(0); + } + #endif + + return simde__m256i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) { @@ -4061,15 +4318,23 @@ SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_permute4x64_epi64 (simde__m256i a, const int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - + simde__m256i_private a_ = simde__m256i_to_private(a); +#if defined(__cplusplus) && __cplusplus >= 202002L + simde__m256i_private r_ = { + .i64 = { + (imm8 & 0x02) ? a_.i64[((imm8 ) & 1)+2] : a_.i64[(imm8 ) & 1], + (imm8 & 0x08) ? a_.i64[((imm8 >> 2 ) & 1)+2] : a_.i64[(imm8 >> 2 ) & 1], + (imm8 & 0x20) ? a_.i64[((imm8 >> 4 ) & 1)+2] : a_.i64[(imm8 >> 4 ) & 1], + (imm8 & 0x80) ? a_.i64[((imm8 >> 6 ) & 1)+2] : a_.i64[(imm8 >> 6 ) & 1], + } + }; +#else + simde__m256i_private r_; r_.i64[0] = (imm8 & 0x02) ? a_.i64[((imm8 ) & 1)+2] : a_.i64[(imm8 ) & 1]; r_.i64[1] = (imm8 & 0x08) ? a_.i64[((imm8 >> 2 ) & 1)+2] : a_.i64[(imm8 >> 2 ) & 1]; r_.i64[2] = (imm8 & 0x20) ? a_.i64[((imm8 >> 4 ) & 1)+2] : a_.i64[(imm8 >> 4 ) & 1]; r_.i64[3] = (imm8 & 0x80) ? 
a_.i64[((imm8 >> 6 ) & 1)+2] : a_.i64[(imm8 >> 6 ) & 1]; - +#endif return simde__m256i_from_private(r_); } #if defined(SIMDE_X86_AVX2_NATIVE) @@ -4370,7 +4635,7 @@ simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] == INT8_C(0)) ? INT8_C(0) : (b_.i8[i] < INT8_C(0)) ? -a_.i8[i] : a_.i8[i]; + r_.i8[i] = (b_.i8[i] == INT8_C(0)) ? INT8_C(0) : (b_.i8[i] < INT8_C(0)) ? HEDLEY_STATIC_CAST(int8_t, -a_.i8[i]) : a_.i8[i]; } return simde__m256i_from_private(r_); @@ -4396,7 +4661,7 @@ simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] == INT16_C(0)) ? INT16_C(0) : (b_.i16[i] < INT16_C(0)) ? -a_.i16[i] : a_.i16[i]; + r_.i16[i] = (b_.i16[i] == INT16_C(0)) ? INT16_C(0) : (b_.i16[i] < INT16_C(0)) ? HEDLEY_STATIC_CAST(int16_t, -a_.i16[i]) : a_.i16[i]; } return simde__m256i_from_private(r_); @@ -4438,30 +4703,29 @@ simde__m256i simde_mm256_sll_epi16 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sll_epi16(a, count); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - return __lasx_xvsll_h(a, __lasx_xvreplgr2vr_h(count[0])); #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); + simde__m256i_private r_, a_; #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + a_ = simde__m256i_to_private(a); r_.m128i[0] = simde_mm_sll_epi16(a_.m128i[0], count); r_.m128i[1] = simde_mm_sll_epi16(a_.m128i[1], count); #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 15) + simde__m128i_private count_ = simde__m128i_to_private(count); + if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { return simde_mm256_setzero_si256(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m256i_to_private(a); - #if 
defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, shift); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvsll_h(a_.i256, __lasx_xvreplgr2vr_h(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(16, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (shift)); + r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << cnt); } #endif #endif @@ -4479,30 +4743,29 @@ simde__m256i simde_mm256_sll_epi32 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sll_epi32(a, count); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - return __lasx_xvsll_w(a, __lasx_xvreplgr2vr_w(count[0])); #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); + simde__m256i_private r_, a_; #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + a_ = simde__m256i_to_private(a); r_.m128i[0] = simde_mm_sll_epi32(a_.m128i[0], count); r_.m128i[1] = simde_mm_sll_epi32(a_.m128i[1], count); #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 31) + simde__m128i_private count_ = simde__m128i_to_private(count); + if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { return simde_mm256_setzero_si256(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m256i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, shift); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvsll_w(a_.i256, __lasx_xvreplgr2vr_w(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 << SIMDE_CAST_VECTOR_SHIFT_COUNT(32, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] << (shift)); + r_.i32[i] = 
HEDLEY_STATIC_CAST(int32_t, a_.i32[i] << cnt); } #endif #endif @@ -4520,30 +4783,29 @@ simde__m256i simde_mm256_sll_epi64 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sll_epi64(a, count); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - return __lasx_xvsll_d(a, __lasx_xvreplgr2vr_d(count[0])); #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); + simde__m256i_private r_, a_; #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + a_ = simde__m256i_to_private(a); r_.m128i[0] = simde_mm_sll_epi64(a_.m128i[0], count); r_.m128i[1] = simde_mm_sll_epi64(a_.m128i[1], count); #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 63) + simde__m128i_private count_ = simde__m128i_to_private(count); + if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { return simde_mm256_setzero_si256(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m256i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, shift); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvsll_d(a_.i256, __lasx_xvreplgr2vr_d(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i64 = a_.i64 << SIMDE_CAST_VECTOR_SHIFT_COUNT(64, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] << (shift)); + r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] << cnt); } #endif #endif @@ -4573,6 +4835,8 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8) for (size_t i = 0 ; i < (sizeof(a_.altivec_i16) / sizeof(a_.altivec_i16[0])) ; i++) { r_.altivec_i16[i] = vec_sl(a_.altivec_i16[i], sv); } + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v16i16)a_.i256 << imm8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, imm8); #else @@ 
-4586,8 +4850,6 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8) -#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) -# define simde_mm256_slli_epi16(a, imm8) (imm8 > 15 ? __lasx_xvreplgr2vr_h(0) : __lasx_xvslli_h(a, imm8 & 15)) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_slli_epi16(a, imm8) \ simde_mm256_set_m128i( \ @@ -4612,6 +4874,8 @@ simde_mm256_slli_epi32 (simde__m256i a, const int imm8) for (size_t i = 0 ; i < (sizeof(a_.altivec_i32) / sizeof(a_.altivec_i32[0])) ; i++) { r_.altivec_i32[i] = vec_sl(a_.altivec_i32[i], sv); } + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v8i32)a_.i256 << imm8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, imm8); #else @@ -4625,8 +4889,6 @@ simde_mm256_slli_epi32 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8) -#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) -# define simde_mm256_slli_epi32(a, imm8) (imm8 > 31 ? __lasx_xvreplgr2vr_w(0) : __lasx_xvslli_w(a, imm8 & 31)) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_slli_epi32(a, imm8) \ simde_mm256_set_m128i( \ @@ -4646,7 +4908,9 @@ simde_mm256_slli_epi64 (simde__m256i a, const int imm8) r_, a_ = simde__m256i_to_private(a); -#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) +#if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v4i64)a_.i256 << imm8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, imm8); #else SIMDE_VECTORIZE @@ -4659,8 +4923,6 @@ simde_mm256_slli_epi64 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8) -#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) -# define simde_mm256_slli_epi64(a, imm8) (imm8 > 63 ? 
__lasx_xvreplgr2vr_d(0) : __lasx_xvslli_d(a, imm8)) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_slli_epi64(a, imm8) \ simde_mm256_set_m128i( \ @@ -4847,8 +5109,6 @@ simde__m256i simde_mm256_sra_epi16 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sra_epi16(a, count); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - return __lasx_xvsra_h(a, __lasx_xvreplgr2vr_h(count[0] > 15 ? 15 : count[0])); #else simde__m256i_private r_, @@ -4858,19 +5118,17 @@ simde_mm256_sra_epi16 (simde__m256i a, simde__m128i count) { r_.m128i[0] = simde_mm_sra_epi16(a_.m128i[0], count); r_.m128i[1] = simde_mm_sra_epi16(a_.m128i[1], count); #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - - if (shift > 15) shift = 15; + simde__m128i_private count_ = simde__m128i_to_private(count); + const int cnt = count_.u64[0] > 15 ? 15 : HEDLEY_STATIC_CAST(int, count_.u64[0]); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvsra_h(a_.i256, __lasx_xvreplgr2vr_h(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i16 = a_.i16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> shift; + r_.i16[i] = a_.i16[i] >> cnt; } #endif #endif @@ -4888,8 +5146,6 @@ simde__m256i simde_mm256_sra_epi32 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_sra_epi32(a, count); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - return __lasx_xvsra_w(a, __lasx_xvreplgr2vr_w(count[0] > 31 ? 
31 : count[0])); #else simde__m256i_private r_, @@ -4899,18 +5155,17 @@ simde_mm256_sra_epi32 (simde__m256i a, simde__m128i count) { r_.m128i[0] = simde_mm_sra_epi32(a_.m128i[0], count); r_.m128i[1] = simde_mm_sra_epi32(a_.m128i[1], count); #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); + simde__m128i_private count_ = simde__m128i_to_private(count); + const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]); - if (shift > 31) shift = 31; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvsra_w(a_.i256, __lasx_xvreplgr2vr_w(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> shift; + r_.i32[i] = a_.i32[i] >> cnt; } #endif #endif @@ -4934,7 +5189,9 @@ simde_mm256_srai_epi16 (simde__m256i a, const int imm8) if (shift > 15) shift = 15; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v16i16)a_.i256 >> shift); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift); #else SIMDE_VECTORIZE @@ -4947,8 +5204,6 @@ simde_mm256_srai_epi16 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8) -#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) -# define simde_mm256_srai_epi16(a, imm8) __lasx_xvsrai_h(a, (imm8 > 15 ? 
15 : imm8)) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_srai_epi16(a, imm8) \ simde_mm256_set_m128i( \ @@ -4971,7 +5226,9 @@ simde_mm256_srai_epi32 (simde__m256i a, const int imm8) if (shift > 31) shift = 31; - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v8i32)a_.i256 >> shift); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift); #else SIMDE_VECTORIZE @@ -4984,8 +5241,6 @@ simde_mm256_srai_epi32 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8) -#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) -# define simde_mm256_srai_epi32(a, imm8) __lasx_xvsrai_w(a, (imm8 > 31 ? 31 : imm8)) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_srai_epi32(a, imm8) \ simde_mm256_set_m128i( \ @@ -5067,28 +5322,29 @@ simde__m256i simde_mm256_srl_epi16 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_srl_epi16(a, count); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - return __lasx_xvsrl_h(a, __lasx_xvreplgr2vr_h(count[0] > 16 ? 16 : count[0])); #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); + simde__m256i_private r_, a_; #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + a_ = simde__m256i_to_private(a); r_.m128i[0] = simde_mm_srl_epi16(a_.m128i[0], count); r_.m128i[1] = simde_mm_srl_epi16(a_.m128i[1], count); #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 16 ? 
16 : count_.i64[0])); + simde__m128i_private count_ = simde__m128i_to_private(count); + if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { + return simde_mm256_setzero_si256(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m256i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, shift); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvsrl_h(a_.i256, __lasx_xvreplgr2vr_h(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, cnt); #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> (shift); + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = a_.u16[i] >> cnt; } #endif #endif @@ -5106,28 +5362,29 @@ simde__m256i simde_mm256_srl_epi32 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_srl_epi32(a, count); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - return __lasx_xvsrl_w(a, __lasx_xvreplgr2vr_w(count[0] > 32 ? 32 : count[0])); #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); + simde__m256i_private r_, a_; #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + a_ = simde__m256i_to_private(a); r_.m128i[0] = simde_mm_srl_epi32(a_.m128i[0], count); r_.m128i[1] = simde_mm_srl_epi32(a_.m128i[1], count); #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 32 ? 
32 : count_.i64[0])); + simde__m128i_private count_ = simde__m128i_to_private(count); + if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { + return simde_mm256_setzero_si256(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m256i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, shift); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvsrl_w(a_.i256, __lasx_xvreplgr2vr_w(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, cnt); #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> (shift); + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = a_.u32[i] >> cnt; } #endif #endif @@ -5145,28 +5402,29 @@ simde__m256i simde_mm256_srl_epi64 (simde__m256i a, simde__m128i count) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_srl_epi64(a, count); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - return __lasx_xvsrl_d(a, __lasx_xvreplgr2vr_d(count[0] > 64 ? 64 : count[0])); #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); + simde__m256i_private r_, a_; #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) + a_ = simde__m256i_to_private(a); r_.m128i[0] = simde_mm_srl_epi64(a_.m128i[0], count); r_.m128i[1] = simde_mm_srl_epi64(a_.m128i[1], count); #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 64 ? 
64 : count_.i64[0])); + simde__m128i_private count_ = simde__m128i_to_private(count); + if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { + return simde_mm256_setzero_si256(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m256i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(64, shift); + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = __lasx_xvsrl_d(a_.i256, __lasx_xvreplgr2vr_d(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(64, cnt); #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> (shift); + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = a_.u64[i] >> cnt; } #endif #endif @@ -5195,6 +5453,8 @@ simde_mm256_srli_epi16 (simde__m256i a, const int imm8) for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) { r_.altivec_u16[i] = vec_sr(a_.altivec_u16[i], sv); } + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v16u16)a_.i256 >> imm8); #else if (HEDLEY_STATIC_CAST(unsigned int, imm8) > 15) { simde_memset(&r_, 0, sizeof(r_)); @@ -5214,8 +5474,6 @@ simde_mm256_srli_epi16 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8) -#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) -# define simde_mm256_srli_epi16(a, imm8) (imm8 > 15 ? 
__lasx_xvreplgr2vr_h(0) : __lasx_xvsrli_h(a, imm8 & 15)) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_srli_epi16(a, imm8) \ simde_mm256_set_m128i( \ @@ -5240,6 +5498,8 @@ simde_mm256_srli_epi32 (simde__m256i a, const int imm8) for (size_t i = 0 ; i < (sizeof(a_.altivec_u32) / sizeof(a_.altivec_u32[0])) ; i++) { r_.altivec_u32[i] = vec_sr(a_.altivec_u32[i], sv); } + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v8u32)a_.i256 >> imm8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8); #else @@ -5253,8 +5513,6 @@ simde_mm256_srli_epi32 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8) -#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) -# define simde_mm256_srli_epi32(a, imm8) __lasx_xvsrli_w(a, imm8) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_srli_epi32(a, imm8) \ simde_mm256_set_m128i( \ @@ -5274,7 +5532,9 @@ simde_mm256_srli_epi64 (simde__m256i a, const int imm8) r_, a_ = simde__m256i_to_private(a); -#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) +#if defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.i256 = (simde__m256i)((v4u64)a_.i256 >> imm8); +#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, imm8); #else SIMDE_VECTORIZE @@ -5287,8 +5547,6 @@ simde_mm256_srli_epi64 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8) -#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) -# define simde_mm256_srli_epi64(a, imm8) __lasx_xvsrli_d(a, imm8) #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_srli_epi64(a, imm8) \ simde_mm256_set_m128i( \ diff --git a/thirdparty/simde/x86/avx512.h b/thirdparty/simde/x86/avx512.h index e3654bc37..cf01de1ce 100644 --- a/thirdparty/simde/x86/avx512.h +++ b/thirdparty/simde/x86/avx512.h @@ -64,6 +64,7 @@ #include 
"avx512/dpwssd.h" #include "avx512/dpwssds.h" #include "avx512/expand.h" +#include "avx512/expandloadu.h" #include "avx512/extract.h" #include "avx512/fixupimm.h" #include "avx512/fixupimm_round.h" diff --git a/thirdparty/simde/x86/avx512/4dpwssd.h b/thirdparty/simde/x86/avx512/4dpwssd.h index 2139099f3..daebca098 100644 --- a/thirdparty/simde/x86/avx512/4dpwssd.h +++ b/thirdparty/simde/x86/avx512/4dpwssd.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_4DPWSSD_H) #define SIMDE_X86_AVX512_4DPWSSD_H diff --git a/thirdparty/simde/x86/avx512/4dpwssds.h b/thirdparty/simde/x86/avx512/4dpwssds.h index ef8cf9780..ee2b26e10 100644 --- a/thirdparty/simde/x86/avx512/4dpwssds.h +++ b/thirdparty/simde/x86/avx512/4dpwssds.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_4DPWSSDS_H) #define SIMDE_X86_AVX512_4DPWSSDS_H diff --git a/thirdparty/simde/x86/avx512/abs.h b/thirdparty/simde/x86/avx512/abs.h index 5ff001485..a037c6e42 100644 --- a/thirdparty/simde/x86/avx512/abs.h +++ b/thirdparty/simde/x86/avx512/abs.h @@ -189,6 +189,90 @@ simde_mm_maskz_abs_epi64(simde__mmask8 k, simde__m128i a) { #define _mm_maskz_abs_epi64(k, a) simde_mm_maskz_abs_epi64(k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_abs_epi8(simde__m256i src, simde__mmask32 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_abs_epi8(src, k, a); + #else + return simde_mm256_mask_mov_epi8(src, k, simde_mm256_abs_epi8(a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_abs_epi8 + #define _mm256_mask_abs_epi8(src, k, a) simde_mm256_mask_abs_epi8(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_abs_epi8(simde__mmask32 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_abs_epi8(k, a); + #else + return simde_mm256_maskz_mov_epi8(k, simde_mm256_abs_epi8(a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_abs_epi8 + #define _mm256_maskz_abs_epi8(k, a) simde_mm256_maskz_abs_epi8(k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_abs_epi16(simde__m256i src, simde__mmask16 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_abs_epi16(src, k, a); + #else + return simde_mm256_mask_mov_epi16(src, k, simde_mm256_abs_epi16(a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_abs_epi16 + #define _mm256_mask_abs_epi16(src, k, a) simde_mm256_mask_abs_epi16(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_abs_epi16(simde__mmask16 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_abs_epi16(k, a); + #else + return simde_mm256_maskz_mov_epi16(k, simde_mm256_abs_epi16(a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_abs_epi16 + #define _mm256_maskz_abs_epi16(k, a) simde_mm256_maskz_abs_epi16(k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_abs_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_abs_epi32(src, k, a); + #else + return simde_mm256_mask_mov_epi32(src, k, simde_mm256_abs_epi32(a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_abs_epi32 + #define _mm256_mask_abs_epi32(src, k, a) simde_mm256_mask_abs_epi32(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_abs_epi32(simde__mmask8 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_abs_epi32(k, a); + #else + return simde_mm256_maskz_mov_epi32(k, simde_mm256_abs_epi32(a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_abs_epi32 + #define _mm256_maskz_abs_epi32(k, a) simde_mm256_maskz_abs_epi32(k, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_abs_epi64(simde__m256i a) { @@ -263,7 +347,7 @@ simde_mm512_abs_epi8 (simde__m512i a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) 
/ sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i]; + r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? HEDLEY_STATIC_CAST(int8_t, -a_.i8[i]) : a_.i8[i]; } #endif @@ -320,7 +404,7 @@ simde_mm512_abs_epi16 (simde__m512i a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i]; + r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? HEDLEY_STATIC_CAST(int16_t, -a_.i16[i]) : a_.i16[i]; } #endif diff --git a/thirdparty/simde/x86/avx512/add.h b/thirdparty/simde/x86/avx512/add.h index d192b2f57..109a7a69b 100644 --- a/thirdparty/simde/x86/avx512/add.h +++ b/thirdparty/simde/x86/avx512/add.h @@ -148,6 +148,62 @@ simde_mm_maskz_add_epi64(simde__mmask8 k, simde__m128i a, simde__m128i b) { #define _mm_maskz_add_epi64(k, a, b) simde_mm_maskz_add_epi64(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_mask_add_ps(simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_add_ps(src, k, a, b); + #else + return simde_mm_mask_mov_ps(src, k, simde_mm_add_ps(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_add_ps + #define _mm_mask_add_ps(src, k, a, b) simde_mm_mask_add_ps(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_maskz_add_ps(simde__mmask8 k, simde__m128 a, simde__m128 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_add_ps(k, a, b); + #else + return simde_mm_maskz_mov_ps(k, simde_mm_add_ps(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_add_ps + #define _mm_maskz_add_ps(k, a, b) simde_mm_maskz_add_ps(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d 
+simde_mm_mask_add_pd(simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_add_pd(src, k, a, b); + #else + return simde_mm_mask_mov_pd(src, k, simde_mm_add_pd(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_add_pd + #define _mm_mask_add_pd(src, k, a, b) simde_mm_mask_add_pd(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_maskz_add_pd(simde__mmask8 k, simde__m128d a, simde__m128d b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_add_pd(k, a, b); + #else + return simde_mm_maskz_mov_pd(k, simde_mm_add_pd(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_add_pd + #define _mm_maskz_add_pd(k, a, b) simde_mm_maskz_add_pd(k, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_mask_add_ss(simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b) { @@ -195,6 +251,81 @@ simde_mm_maskz_add_ss(simde__mmask8 k, simde__m128 a, simde__m128 b) { #define _mm_maskz_add_ss(k, a, b) simde_mm_maskz_add_ss(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_mask_add_sd(simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm_mask_add_sd(src, k, a, b); + #elif 1 + simde__m128d_private + src_ = simde__m128d_to_private(src), + a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b), + r_ = simde__m128d_to_private(a); + + r_.f64[0] = (k & 1) ? 
(a_.f64[0] + b_.f64[0]) : src_.f64[0]; + + return simde__m128d_from_private(r_); + #else + return simde_mm_move_sd(a, simde_mm_mask_mov_pd(src, k, simde_mm_add_pd(a, b))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_add_sd + #define _mm_mask_add_sd(src, k, a, b) simde_mm_mask_add_sd(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_maskz_add_sd(simde__mmask8 k, simde__m128d a, simde__m128d b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm_maskz_add_sd(k, a, b); + #elif 1 + simde__m128d_private + a_ = simde__m128d_to_private(a), + b_ = simde__m128d_to_private(b), + r_ = simde__m128d_to_private(a); + + r_.f64[0] = (k & 1) ? (a_.f64[0] + b_.f64[0]) : 0.0; + + return simde__m128d_from_private(r_); + #else + return simde_mm_move_sd(a, simde_mm_maskz_mov_pd(k, simde_mm_add_pd(a, b))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_add_sd + #define _mm_maskz_add_sd(k, a, b) simde_mm_maskz_add_sd(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_add_epi8(simde__m256i src, simde__mmask32 k, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm256_mask_add_epi8(src, k, a, b); + #else + return simde_mm256_mask_mov_epi8(src, k, simde_mm256_add_epi8(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_add_epi8 + #define _mm256_mask_add_epi8(src, k, a, b) simde_mm256_mask_add_epi8(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_add_epi8(simde__mmask32 k, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm256_maskz_add_epi8(k, a, b); + #else + return simde_mm256_maskz_mov_epi8(k, simde_mm256_add_epi8(a, b)); + #endif +} +#if 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_add_epi8 + #define _mm256_maskz_add_epi8(k, a, b) simde_mm256_maskz_add_epi8(k, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_mask_add_epi16(simde__m256i src, simde__mmask16 k, simde__m256i a, simde__m256i b) { @@ -279,6 +410,62 @@ simde_mm256_maskz_add_epi64(simde__mmask8 k, simde__m256i a, simde__m256i b) { #define _mm256_maskz_add_epi64(k, a, b) simde_mm256_maskz_add_epi64(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm256_mask_add_ps(simde__m256 src, simde__mmask8 k, simde__m256 a, simde__m256 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_add_ps(src, k, a, b); + #else + return simde_mm256_mask_mov_ps(src, k, simde_mm256_add_ps(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_add_ps + #define _mm256_mask_add_ps(src, k, a, b) simde_mm256_mask_add_ps(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm256_maskz_add_ps(simde__mmask8 k, simde__m256 a, simde__m256 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_add_ps(k, a, b); + #else + return simde_mm256_maskz_mov_ps(k, simde_mm256_add_ps(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_add_ps + #define _mm256_maskz_add_ps(k, a, b) simde_mm256_maskz_add_ps(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256d +simde_mm256_mask_add_pd(simde__m256d src, simde__mmask8 k, simde__m256d a, simde__m256d b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_add_pd(src, k, a, b); + #else + return simde_mm256_mask_mov_pd(src, k, simde_mm256_add_pd(a, b)); + #endif +} +#if 
defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_add_pd + #define _mm256_mask_add_pd(src, k, a, b) simde_mm256_mask_add_pd(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256d +simde_mm256_maskz_add_pd(simde__mmask8 k, simde__m256d a, simde__m256d b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_add_pd(k, a, b); + #else + return simde_mm256_maskz_mov_pd(k, simde_mm256_add_pd(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_add_pd + #define _mm256_maskz_add_pd(k, a, b) simde_mm256_maskz_add_pd(k, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_add_epi8 (simde__m512i a, simde__m512i b) { diff --git a/thirdparty/simde/x86/avx512/adds.h b/thirdparty/simde/x86/avx512/adds.h index 64abffaab..fcba2309f 100644 --- a/thirdparty/simde/x86/avx512/adds.h +++ b/thirdparty/simde/x86/avx512/adds.h @@ -92,6 +92,62 @@ simde_mm_maskz_adds_epi16(simde__mmask8 k, simde__m128i a, simde__m128i b) { #define _mm_maskz_adds_epi16(k, a, b) simde_mm_maskz_adds_epi16(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_adds_epu8 (simde__m128i src, simde__mmask16 k, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_adds_epu8(src, k, a, b); + #else + return simde_mm_mask_mov_epi8(src, k, simde_mm_adds_epu8(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_adds_epu8 + #define _mm_mask_adds_epu8(src, k, a, b) simde_mm_mask_adds_epu8(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_adds_epu8 (simde__mmask16 k, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && 
defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_adds_epu8(k, a, b); + #else + return simde_mm_maskz_mov_epi8(k, simde_mm_adds_epu8(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_adds_epu8 + #define _mm_maskz_adds_epu8(k, a, b) simde_mm_maskz_adds_epu8(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_adds_epu16 (simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_adds_epu16(src, k, a, b); + #else + return simde_mm_mask_mov_epi16(src, k, simde_mm_adds_epu16(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_adds_epu16 + #define _mm_mask_adds_epu16(src, k, a, b) simde_mm_mask_adds_epu16(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_adds_epu16 (simde__mmask8 k, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_adds_epu16(k, a, b); + #else + return simde_mm_maskz_mov_epi16(k, simde_mm_adds_epu16(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_adds_epu16 + #define _mm_maskz_adds_epu16(k, a, b) simde_mm_maskz_adds_epu16(k, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_mask_adds_epi8(simde__m256i src, simde__mmask32 k, simde__m256i a, simde__m256i b) { @@ -148,6 +204,62 @@ simde_mm256_maskz_adds_epi16(simde__mmask16 k, simde__m256i a, simde__m256i b) { #define _mm256_maskz_adds_epi16(k, a, b) simde_mm256_maskz_adds_epi16(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_adds_epu8 (simde__m256i src, simde__mmask32 k, simde__m256i a, simde__m256i b) { + #if 
defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_adds_epu8(src, k, a, b); + #else + return simde_mm256_mask_mov_epi8(src, k, simde_mm256_adds_epu8(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_adds_epu8 + #define _mm256_mask_adds_epu8(src, k, a, b) simde_mm256_mask_adds_epu8(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_adds_epu8 (simde__mmask32 k, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_adds_epu8(k, a, b); + #else + return simde_mm256_maskz_mov_epi8(k, simde_mm256_adds_epu8(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_adds_epu8 + #define _mm256_maskz_adds_epu8(k, a, b) simde_mm256_maskz_adds_epu8(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_adds_epu16 (simde__m256i src, simde__mmask16 k, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_adds_epu16(src, k, a, b); + #else + return simde_mm256_mask_mov_epi16(src, k, simde_mm256_adds_epu16(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_adds_epu16 + #define _mm256_mask_adds_epu16(src, k, a, b) simde_mm256_mask_adds_epu16(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_adds_epu16 (simde__mmask16 k, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_adds_epu16(k, a, b); + #else + return simde_mm256_maskz_mov_epi16(k, simde_mm256_adds_epu16(a, b)); + #endif +} +#if 
defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_adds_epu16 + #define _mm256_maskz_adds_epu16(k, a, b) simde_mm256_maskz_adds_epu16(k, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_adds_epi8 (simde__m512i a, simde__m512i b) { diff --git a/thirdparty/simde/x86/avx512/and.h b/thirdparty/simde/x86/avx512/and.h index fd7118f86..3dde2e544 100644 --- a/thirdparty/simde/x86/avx512/and.h +++ b/thirdparty/simde/x86/avx512/and.h @@ -38,6 +38,230 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_mask_and_pd(simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_and_pd(src, k, a, b); + #else + return simde_mm_mask_mov_pd(src, k, simde_mm_and_pd(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_and_pd + #define _mm_mask_and_pd(src, k, a, b) simde_mm_mask_and_pd(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_maskz_and_pd(simde__mmask8 k, simde__m128d a, simde__m128d b) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_and_pd(k, a, b); + #else + return simde_mm_maskz_mov_pd(k, simde_mm_and_pd(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_and_pd + #define _mm_maskz_and_pd(k, a, b) simde_mm_maskz_and_pd(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_mask_and_ps(simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_and_ps(src, k, a, b); + #else + return simde_mm_mask_mov_ps(src, k, simde_mm_and_ps(a, 
b)); + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_and_ps + #define _mm_mask_and_ps(src, k, a, b) simde_mm_mask_and_ps(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_maskz_and_ps(simde__mmask8 k, simde__m128 a, simde__m128 b) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_and_ps(k, a, b); + #else + return simde_mm_maskz_mov_ps(k, simde_mm_and_ps(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_and_ps + #define _mm_maskz_and_ps(k, a, b) simde_mm_maskz_and_ps(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_and_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_and_epi32(src, k, a, b); + #else + return simde_mm_mask_mov_epi32(src, k, simde_mm_and_si128(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_and_epi32 + #define _mm_mask_and_epi32(src, k, a, b) simde_mm_mask_and_epi32(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_and_epi32(simde__mmask8 k, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_and_epi32(k, a, b); + #else + return simde_mm_maskz_mov_epi32(k, simde_mm_and_si128(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_and_epi32 + #define _mm_maskz_and_epi32(k, a, b) simde_mm_maskz_and_epi32(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_and_epi64(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { + #if 
defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_and_epi64(src, k, a, b); + #else + return simde_mm_mask_mov_epi64(src, k, simde_mm_and_si128(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_and_epi64 + #define _mm_mask_and_epi64(src, k, a, b) simde_mm_mask_and_epi64(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_and_epi64(simde__mmask8 k, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_and_epi64(k, a, b); + #else + return simde_mm_maskz_mov_epi64(k, simde_mm_and_si128(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_and_epi64 + #define _mm_maskz_and_epi64(k, a, b) simde_mm_maskz_and_epi64(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256d +simde_mm256_mask_and_pd(simde__m256d src, simde__mmask8 k, simde__m256d a, simde__m256d b) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_and_pd(src, k, a, b); + #else + return simde_mm256_mask_mov_pd(src, k, simde_mm256_and_pd(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_and_pd + #define _mm256_mask_and_pd(src, k, a, b) simde_mm256_mask_and_pd(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256d +simde_mm256_maskz_and_pd(simde__mmask8 k, simde__m256d a, simde__m256d b) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_and_pd(k, a, b); + #else + return simde_mm256_maskz_mov_pd(k, simde_mm256_and_pd(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_and_pd + 
#define _mm256_maskz_and_pd(k, a, b) simde_mm256_maskz_and_pd(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm256_mask_and_ps(simde__m256 src, simde__mmask8 k, simde__m256 a, simde__m256 b) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_and_ps(src, k, a, b); + #else + return simde_mm256_mask_mov_ps(src, k, simde_mm256_and_ps(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_and_ps + #define _mm256_mask_and_ps(src, k, a, b) simde_mm256_mask_and_ps(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm256_maskz_and_ps(simde__mmask8 k, simde__m256 a, simde__m256 b) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_and_ps(k, a, b); + #else + return simde_mm256_maskz_mov_ps(k, simde_mm256_and_ps(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_and_ps + #define _mm256_maskz_and_ps(k, a, b) simde_mm256_maskz_and_ps(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_and_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_and_epi32(src, k, a, b); + #else + return simde_mm256_mask_mov_epi32(src, k, simde_mm256_and_si256(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_and_epi32 + #define _mm256_mask_and_epi32(src, k, a, b) simde_mm256_mask_and_epi32(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_and_epi32(simde__mmask8 k, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return 
_mm256_maskz_and_epi32(k, a, b); + #else + return simde_mm256_maskz_mov_epi32(k, simde_mm256_and_si256(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_and_epi32 + #define _mm256_maskz_and_epi32(k, a, b) simde_mm256_maskz_and_epi32(k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_and_epi64(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_and_epi64(src, k, a, b); + #else + return simde_mm256_mask_mov_epi64(src, k, simde_mm256_and_si256(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_and_epi64 + #define _mm256_mask_and_epi64(src, k, a, b) simde_mm256_mask_and_epi64(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_and_epi64(simde__mmask8 k, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_and_epi64(k, a, b); + #else + return simde_mm256_maskz_mov_epi64(k, simde_mm256_and_si256(a, b)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_and_epi64 + #define _mm256_maskz_and_epi64(k, a, b) simde_mm256_maskz_and_epi64(k, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_and_pd (simde__m512d a, simde__m512d b) { diff --git a/thirdparty/simde/x86/avx512/bitshuffle.h b/thirdparty/simde/x86/avx512/bitshuffle.h index 05f4b5c8e..9760e6bf9 100644 --- a/thirdparty/simde/x86/avx512/bitshuffle.h +++ b/thirdparty/simde/x86/avx512/bitshuffle.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated 
documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_BITSHUFFLE_H) #define SIMDE_X86_AVX512_BITSHUFFLE_H diff --git a/thirdparty/simde/x86/avx512/blend.h b/thirdparty/simde/x86/avx512/blend.h index e34dd20b1..049e66250 100644 --- a/thirdparty/simde/x86/avx512/blend.h +++ b/thirdparty/simde/x86/avx512/blend.h @@ -49,6 +49,12 @@ simde_mm_mask_blend_epi8(simde__mmask16 k, simde__m128i a, simde__m128i b) { #define _mm_mask_blend_epi8(k, a, b) simde_mm_mask_blend_epi8(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_mask_blend_epu8(simde__mmask16 k, simde__m128i a, simde__m128i b) { + return simde_x_mm_mask_mov_epu8(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_mask_blend_epi16(simde__mmask8 k, simde__m128i a, simde__m128i b) { @@ -63,6 +69,12 @@ simde_mm_mask_blend_epi16(simde__mmask8 k, simde__m128i a, simde__m128i b) { #define _mm_mask_blend_epi16(k, a, b) simde_mm_mask_blend_epi16(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i 
+simde_x_mm_mask_blend_epu16(simde__mmask8 k, simde__m128i a, simde__m128i b) { + return simde_x_mm_mask_mov_epu16(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_mask_blend_epi32(simde__mmask8 k, simde__m128i a, simde__m128i b) { @@ -77,6 +89,12 @@ simde_mm_mask_blend_epi32(simde__mmask8 k, simde__m128i a, simde__m128i b) { #define _mm_mask_blend_epi32(k, a, b) simde_mm_mask_blend_epi32(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_mask_blend_epu32(simde__mmask8 k, simde__m128i a, simde__m128i b) { + return simde_x_mm_mask_mov_epu32(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_mask_blend_epi64(simde__mmask8 k, simde__m128i a, simde__m128i b) { @@ -91,6 +109,12 @@ simde_mm_mask_blend_epi64(simde__mmask8 k, simde__m128i a, simde__m128i b) { #define _mm_mask_blend_epi64(k, a, b) simde_mm_mask_blend_epi64(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_mask_blend_epu64(simde__mmask8 k, simde__m128i a, simde__m128i b) { + return simde_x_mm_mask_mov_epu64(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_mask_blend_ps(simde__mmask8 k, simde__m128 a, simde__m128 b) { @@ -133,6 +157,12 @@ simde_mm256_mask_blend_epi8(simde__mmask32 k, simde__m256i a, simde__m256i b) { #define _mm256_mask_blend_epi8(k, a, b) simde_mm256_mask_blend_epi8(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_mask_blend_epu8(simde__mmask32 k, simde__m256i a, simde__m256i b) { + return simde_x_mm256_mask_mov_epu8(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_mask_blend_epi16(simde__mmask16 k, simde__m256i a, simde__m256i b) { @@ -147,6 +177,12 @@ simde_mm256_mask_blend_epi16(simde__mmask16 k, simde__m256i a, simde__m256i b) { #define _mm256_mask_blend_epi16(k, a, b) simde_mm256_mask_blend_epi16(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_mask_blend_epu16(simde__mmask16 k, simde__m256i a, simde__m256i b) { + return 
simde_x_mm256_mask_mov_epu16(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_mask_blend_epi32(simde__mmask8 k, simde__m256i a, simde__m256i b) { @@ -161,6 +197,12 @@ simde_mm256_mask_blend_epi32(simde__mmask8 k, simde__m256i a, simde__m256i b) { #define _mm256_mask_blend_epi32(k, a, b) simde_mm256_mask_blend_epi32(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_mask_blend_epu32(simde__mmask8 k, simde__m256i a, simde__m256i b) { + return simde_x_mm256_mask_mov_epu32(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_mask_blend_epi64(simde__mmask8 k, simde__m256i a, simde__m256i b) { @@ -175,6 +217,12 @@ simde_mm256_mask_blend_epi64(simde__mmask8 k, simde__m256i a, simde__m256i b) { #define _mm256_mask_blend_epi64(k, a, b) simde_mm256_mask_blend_epi64(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_mask_blend_epu64(simde__mmask8 k, simde__m256i a, simde__m256i b) { + return simde_x_mm256_mask_mov_epu64(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256 simde_mm256_mask_blend_ps(simde__mmask8 k, simde__m256 a, simde__m256 b) { @@ -217,6 +265,12 @@ simde_mm512_mask_blend_epi8(simde__mmask64 k, simde__m512i a, simde__m512i b) { #define _mm512_mask_blend_epi8(k, a, b) simde_mm512_mask_blend_epi8(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_x_mm512_mask_blend_epu8(simde__mmask64 k, simde__m512i a, simde__m512i b) { + return simde_x_mm512_mask_mov_epu8(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_mask_blend_epi16(simde__mmask32 k, simde__m512i a, simde__m512i b) { @@ -245,6 +299,12 @@ simde_mm512_mask_blend_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { #define _mm512_mask_blend_epi32(k, a, b) simde_mm512_mask_blend_epi32(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_x_mm512_mask_blend_epu32(simde__mmask16 k, simde__m512i a, simde__m512i b) { + return simde_x_mm512_mask_mov_epu32(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES 
simde__m512i simde_mm512_mask_blend_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { @@ -259,6 +319,12 @@ simde_mm512_mask_blend_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { #define _mm512_mask_blend_epi64(k, a, b) simde_mm512_mask_blend_epi64(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_x_mm512_mask_blend_epu64(simde__mmask8 k, simde__m512i a, simde__m512i b) { + return simde_x_mm512_mask_mov_epu64(a, k, b); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_mask_blend_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { diff --git a/thirdparty/simde/x86/avx512/broadcast.h b/thirdparty/simde/x86/avx512/broadcast.h index 33b41abd1..662e9e936 100644 --- a/thirdparty/simde/x86/avx512/broadcast.h +++ b/thirdparty/simde/x86/avx512/broadcast.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Hidayat Khan * 2020 Christopher Moore + * 2025 Michael R. Crusoe */ #if !defined(SIMDE_X86_AVX512_BROADCAST_H) @@ -640,6 +641,132 @@ simde_mm512_maskz_broadcastd_epi32(simde__mmask16 k, simde__m128i a) { #define _mm512_maskz_broadcastd_epi32(k, a) simde_mm512_maskz_broadcastd_epi32(k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_broadcastmb_epi64 (simde__mmask8 k) { + #if defined(SIMDE_X86_AVX512CD_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_broadcastmb_epi64(k); + #else + simde__m128i_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = k; + } + + return simde__m128i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_broadcastmb_epi64 + #define _mm_broadcastmb_epi64(k) simde_mm_broadcastmb_epi64(k) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_broadcastmb_epi64 (simde__mmask8 k) { + #if defined(SIMDE_X86_AVX512CD_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_broadcastmb_epi64(k); + #else + simde__m256i_private r_; + 
+ SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = k; + } + + return simde__m256i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_broadcastmb_epi64 + #define _mm256_broadcastmb_epi64(k) simde_mm256_broadcastmb_epi64(k) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_broadcastmb_epi64 (simde__mmask8 k) { + #if defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm512_broadcastmb_epi64(k); + #else + simde__m512i_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = k; + } + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm512_broadcastmb_epi64 + #define _mm512_broadcastmb_epi64(k) simde_mm512_broadcastmb_epi64(k) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_broadcastmw_epi32 (simde__mmask16 k) { + #if defined(SIMDE_X86_AVX512CD_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_broadcastmw_epi32(k); + #else + simde__m128i_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = k; + } + + return simde__m128i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_broadcastmw_epi32 + #define _mm_broadcastmw_epi32(k) simde_mm_broadcastmw_epi32(k) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_broadcastmw_epi32 (simde__mmask16 k) { + #if defined(SIMDE_X86_AVX512CD_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_broadcastmw_epi32(k); + #else + simde__m256i_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = k; + } + + return simde__m256i_from_private(r_); + #endif +} +#if 
defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_broadcastmw_epi32 + #define _mm256_broadcastmw_epi32(k) simde_mm256_broadcastmw_epi32(k) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_broadcastmw_epi32 (simde__mmask16 k) { + #if defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm512_broadcastmw_epi32(k); + #else + simde__m512i_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = k; + } + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm512_broadcastmw_epi32 + #define _mm512_broadcastmw_epi32(k) simde_mm512_broadcastmw_epi32(k) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_broadcastq_epi64 (simde__m128i a) { diff --git a/thirdparty/simde/x86/avx512/cmpeq.h b/thirdparty/simde/x86/avx512/cmpeq.h index 41f90b3e9..6362b7307 100644 --- a/thirdparty/simde/x86/avx512/cmpeq.h +++ b/thirdparty/simde/x86/avx512/cmpeq.h @@ -23,6 +23,7 @@ * Copyright: * 2020-2021 Evan Nemerson * 2020 Himanshi Mathur + * 2025 Michael R. 
Crusoe */ #if !defined(SIMDE_X86_AVX512_CMPEQ_H) @@ -30,7 +31,6 @@ #include "types.h" #include "../avx2.h" -#include "mov.h" #include "mov_mask.h" #include "cmp.h" @@ -38,9 +38,105 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm_cmpeq_epi8_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpeq_epi8_mask(a, b); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask16 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i8), a_.i8 == b_.i8); + r = simde_mm_movepi8_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] == b_.u8[i]) ? (UINT16_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpeq_epi8_mask + #define _mm_cmpeq_epi8_mask(a, b) simde_mm_cmpeq_epi8_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm_mask_cmpeq_epi8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpeq_epi8_mask(k1, a, b); + #else + return simde_mm_cmpeq_epi8_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpeq_epi8_mask + #define _mm_mask_cmpeq_epi8_mask(k1, a, b) simde_mm_mask_cmpeq_epi8_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm256_cmpeq_epi8_mask(simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return 
_mm256_cmpeq_epi8_mask(a, b); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) + return HEDLEY_STATIC_CAST(simde__mmask32, simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(a, b))); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask32 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i8), a_.i8 == b_.i8); + r = simde_mm256_movepi8_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] == b_.u8[i]) ? (UINT32_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmpeq_epi8_mask + #define _mm256_cmpeq_epi8_mask(a, b) simde_mm256_cmpeq_epi8_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm256_mask_cmpeq_epi8_mask(simde__mmask32 k1, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_cmpeq_epi8_mask(k1, a, b); + #else + return simde_mm256_cmpeq_epi8_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmpeq_epi8_mask + #define _mm256_mask_cmpeq_epi8_mask(k1, a, b) simde_mm256_mask_cmpeq_epi8_mask((k1), (a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__mmask64 -simde_mm512_cmpeq_epi8_mask (simde__m512i a, simde__m512i b) { +simde_mm512_cmpeq_epi8_mask(simde__m512i a, simde__m512i b) { #if defined(SIMDE_X86_AVX512BW_NATIVE) return _mm512_cmpeq_epi8_mask(a, b); #else @@ -93,9 +189,236 @@ simde_mm512_mask_cmpeq_epi8_mask(simde__mmask64 k1, simde__m512i a, simde__m512i #define _mm512_mask_cmpeq_epi8_mask(k1, a, b) simde_mm512_mask_cmpeq_epi8_mask((k1), (a), (b)) #endif 
+SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_cmpeq_epi16_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpeq_epi16_mask(a, b); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i16), a_.i16 == b_.i16); + r = simde_mm_movepi16_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] == b_.u16[i]) ? (UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpeq_epi16_mask + #define _mm_cmpeq_epi16_mask(a, b) simde_mm_cmpeq_epi16_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_mask_cmpeq_epi16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpeq_epi16_mask(k1, a, b); + #else + return simde_mm_cmpeq_epi16_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpeq_epi16_mask + #define _mm_mask_cmpeq_epi16_mask(k1, a, b) simde_mm_mask_cmpeq_epi16_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm256_cmpeq_epi16_mask(simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_cmpeq_epi16_mask(a, b); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) + return simde_mm256_movepi16_mask(simde_mm256_cmpeq_epi16(a, b)); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + 
simde__mmask16 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i16), a_.i16 == b_.i16); + r = simde_mm256_movepi16_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] == b_.u16[i]) ? (UINT16_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmpeq_epi16_mask + #define _mm256_cmpeq_epi16_mask(a, b) simde_mm256_cmpeq_epi16_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm256_mask_cmpeq_epi16_mask(simde__mmask16 k1, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_cmpeq_epi16_mask(k1, a, b); + #else + return simde_mm256_cmpeq_epi16_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmpeq_epi16_mask + #define _mm256_mask_cmpeq_epi16_mask(k1, a, b) simde_mm256_mask_cmpeq_epi16_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmpeq_epi16_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpeq_epi16_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask32 r; + + #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { + const uint32_t t = HEDLEY_STATIC_CAST(uint32_t, simde_mm256_movepi16_mask(simde_mm256_cmpeq_epi16(a_.m256i[i], b_.m256i[i]))); + r |= HEDLEY_STATIC_CAST(uint64_t, t) << (i * 16); + } + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + 
tmp.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i16), a_.i16 == b_.i16); + r = simde_mm512_movepi16_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] == b_.u16[i]) ? (UINT32_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpeq_epi16_mask + #define _mm512_cmpeq_epi16_mask(a, b) simde_mm512_cmpeq_epi16_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_mask_cmpeq_epi16_mask(simde__mmask32 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_cmpeq_epi16_mask(k1, a, b); + #else + return simde_mm512_cmpeq_epi16_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpeq_epi16_mask + #define _mm512_mask_cmpeq_epi16_mask(k1, a, b) simde_mm512_mask_cmpeq_epi16_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_cmpeq_epi32_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpeq_epi32_mask(a, b); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i32), a_.i32 == b_.i32); + r = simde_mm_movepi32_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r |= (a_.u32[i] == b_.u32[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpeq_epi32_mask + #define _mm_cmpeq_epi32_mask(a, b) simde_mm_cmpeq_epi32_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_mask_cmpeq_epi32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpeq_epi32_mask(k1, a, b); + #else + return simde_mm_cmpeq_epi32_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpeq_epi32_mask + #define _mm_mask_cmpeq_epi32_mask(k1, a, b) simde_mm_mask_cmpeq_epi32_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_cmpeq_epi32_mask(simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_cmpeq_epi32_mask(a, b); + #else + simde__m256i r = simde_mm256_cmpeq_epi32(a, b); + return simde_mm256_movepi32_mask(r); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmpeq_epi32_mask + #define _mm256_cmpeq_epi32_mask(a, b) simde_mm256_cmpeq_epi32_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_mask_cmpeq_epi32_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_cmpeq_epi32_mask(k1, a, b); + #else + return simde_mm256_cmpeq_epi32_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmpeq_epi32_mask + #define _mm256_mask_cmpeq_epi32_mask(k1, a, b) simde_mm256_mask_cmpeq_epi32_mask(k1, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES 
simde__mmask16 -simde_mm512_cmpeq_epi32_mask (simde__m512i a, simde__m512i b) { +simde_mm512_cmpeq_epi32_mask(simde__m512i a, simde__m512i b) { #if defined(SIMDE_X86_AVX512F_NATIVE) return _mm512_cmpeq_epi32_mask(a, b); #else @@ -118,7 +441,7 @@ simde_mm512_cmpeq_epi32_mask (simde__m512i a, simde__m512i b) { SIMDE_FUNCTION_ATTRIBUTES simde__mmask16 -simde_mm512_mask_cmpeq_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m512i b) { +simde_mm512_mask_cmpeq_epi32_mask(simde__mmask16 k1, simde__m512i a, simde__m512i b) { #if defined(SIMDE_X86_AVX512F_NATIVE) return _mm512_mask_cmpeq_epi32_mask(k1, a, b); #else @@ -132,7 +455,83 @@ simde_mm512_mask_cmpeq_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m51 SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 -simde_mm512_cmpeq_epi64_mask (simde__m512i a, simde__m512i b) { +simde_mm_cmpeq_epi64_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpeq_epi64_mask(a, b); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i64), a_.i64 == b_.i64); + r = simde_mm_movepi64_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r |= (a_.u64[i] == b_.u64[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpeq_epi64_mask + #define _mm_cmpeq_epi64_mask(a, b) simde_mm_cmpeq_epi64_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_mask_cmpeq_epi64_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpeq_epi64_mask(k1, a, b); + #else + return simde_mm_cmpeq_epi64_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpeq_epi64_mask + #define _mm_mask_cmpeq_epi64_mask(k1, a, b) simde_mm_mask_cmpeq_epi64_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_cmpeq_epi64_mask(simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_cmpeq_epi64_mask(a, b); + #else + simde__m256i r = simde_mm256_cmpeq_epi64(a, b); + return simde_mm256_movepi64_mask(r); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmpeq_epi64_mask + #define _mm256_cmpeq_epi64_mask(a, b) simde_mm256_cmpeq_epi64_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_mask_cmpeq_epi64_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_cmpeq_epi64_mask(k1, a, b); + #else + return simde_mm256_cmpeq_epi64_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmpeq_epi64_mask + #define _mm256_mask_cmpeq_epi64_mask(k1, a, b) simde_mm256_mask_cmpeq_epi64_mask(k1, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde__mmask8 +simde_mm512_cmpeq_epi64_mask(simde__m512i a, simde__m512i b) { #if defined(SIMDE_X86_AVX512F_NATIVE) return _mm512_cmpeq_epi64_mask(a, b); #else @@ -155,7 +554,7 @@ simde_mm512_cmpeq_epi64_mask (simde__m512i a, simde__m512i b) { SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 -simde_mm512_mask_cmpeq_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512i b) { +simde_mm512_mask_cmpeq_epi64_mask(simde__mmask8 k1, simde__m512i a, simde__m512i b) { #if defined(SIMDE_X86_AVX512F_NATIVE) return _mm512_mask_cmpeq_epi64_mask(k1, a, b); #else @@ -168,55 +567,587 @@ simde_mm512_mask_cmpeq_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512 #endif SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_cmpeq_epu16_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmpeq_epu16_mask(a, b); +simde__mmask16 +simde_mm_cmpeq_epu8_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpeq_epu8_mask(a, b); #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask32 r; + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask16 r; #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m512i_private tmp; + simde__m128i_private tmp; - tmp.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u16), a_.u16 == b_.u16); - r = simde_mm512_movepi16_mask(simde__m512i_from_private(tmp)); + tmp.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u8), a_.u8 == b_.u8); + r = simde_mm_movepi8_mask(simde__m128i_from_private(tmp)); #else r = 0; SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - r |= (a_.u16[i] == b_.u16[i]) ? (UINT16_C(1) << i) : 0; + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] == b_.u8[i]) ? 
(UINT16_C(1) << i) : 0; } #endif return r; #endif } -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpeq_epu16_mask - #define _mm512_cmpeq_epu16_mask(a, b) simde_mm512_cmpeq_epu16_mask((a), (b)) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpeq_epu8_mask + #define _mm_cmpeq_epu8_mask(a, b) simde_mm_cmpeq_epu8_mask(a, b) #endif - SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_mask_cmpeq_epu16_mask(simde__mmask32 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmpeq_epu16_mask(k1, a, b); +simde__mmask16 +simde_mm_mask_cmpeq_epu8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpeq_epu8_mask(k1, a, b); #else - return k1 & simde_mm512_cmpeq_epu16_mask(a, b); + return simde_mm_cmpeq_epu8_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpeq_epu16_mask - #define _mm512_mask_cmpeq_epu16_mask(k1, a, b) simde_mm512_mask_cmpeq_epu16_mask(k1, a, b) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpeq_epu8_mask + #define _mm_mask_cmpeq_epu8_mask(k1, a, b) simde_mm_mask_cmpeq_epu8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 +simde__mmask32 +simde_mm256_cmpeq_epu8_mask(simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_cmpeq_epu8_mask(a, b); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) + return HEDLEY_STATIC_CAST(simde__mmask32, simde_mm256_movemask_epi8(simde_x_mm256_cmpeq_epu8(a, b))); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask32 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + 
tmp.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u8), a_.u8 == b_.u8); + r = simde_mm256_movepi8_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] == b_.u8[i]) ? (UINT32_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmpeq_epu8_mask + #define _mm256_cmpeq_epu8_mask(a, b) simde_mm256_cmpeq_epu8_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm256_mask_cmpeq_epu8_mask(simde__mmask32 k1, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_cmpeq_epu8_mask(k1, a, b); + #else + return simde_mm256_cmpeq_epu8_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmpeq_epu8_mask + #define _mm256_mask_cmpeq_epu8_mask(k1, a, b) simde_mm256_mask_cmpeq_epu8_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask64 +simde_mm512_cmpeq_epu8_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpeq_epu8_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask64 r; + + #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { + const uint32_t t = HEDLEY_STATIC_CAST(uint32_t, simde_mm256_movemask_epi8(simde_x_mm256_cmpeq_epu8(a_.m256i[i], b_.m256i[i]))); + r |= HEDLEY_STATIC_CAST(uint64_t, t) << (i * 32); + } + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + tmp.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u8), a_.u8 == b_.u8); + r = 
simde_mm512_movepi8_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] == b_.u8[i]) ? (UINT64_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpeq_epu8_mask + #define _mm512_cmpeq_epu8_mask(a, b) simde_mm512_cmpeq_epu8_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask64 +simde_mm512_mask_cmpeq_epu8_mask(simde__mmask64 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_cmpeq_epu8_mask(k1, a, b); + #else + return simde_mm512_cmpeq_epu8_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpeq_epu8_mask + #define _mm512_mask_cmpeq_epu8_mask(k1, a, b) simde_mm512_mask_cmpeq_epu8_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_cmpeq_epu16_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpeq_epu16_mask(a, b); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u16), a_.u16 == b_.u16); + r = simde_mm_movepi16_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] == b_.u16[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpeq_epu16_mask + #define _mm_cmpeq_epu16_mask(a, b) simde_mm_cmpeq_epu16_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_mask_cmpeq_epu16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpeq_epu16_mask(k1, a, b); + #else + return simde_mm_cmpeq_epu16_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpeq_epu16_mask + #define _mm_mask_cmpeq_epu16_mask(k1, a, b) simde_mm_mask_cmpeq_epu16_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm256_cmpeq_epu16_mask(simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_cmpeq_epu16_mask(a, b); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) + return simde_mm256_movepi16_mask(simde_x_mm256_cmpeq_epu16(a, b)); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask16 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u16), a_.u16 == b_.u16); + r = simde_mm256_movepi16_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] == b_.u16[i]) ? 
(UINT16_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmpeq_epu16_mask + #define _mm256_cmpeq_epu16_mask(a, b) simde_mm256_cmpeq_epu16_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm256_mask_cmpeq_epu16_mask(simde__mmask16 k1, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_cmpeq_epu16_mask(k1, a, b); + #else + return simde_mm256_cmpeq_epu16_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmpeq_epu16_mask + #define _mm256_mask_cmpeq_epu16_mask(k1, a, b) simde_mm256_mask_cmpeq_epu16_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmpeq_epu16_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpeq_epu16_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask32 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + tmp.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u16), a_.u16 == b_.u16); + r = simde_mm512_movepi16_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] == b_.u16[i]) ? 
(UINT32_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpeq_epu16_mask + #define _mm512_cmpeq_epu16_mask(a, b) simde_mm512_cmpeq_epu16_mask((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_mask_cmpeq_epu16_mask(simde__mmask32 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_cmpeq_epu16_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpeq_epu16_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpeq_epu16_mask + #define _mm512_mask_cmpeq_epu16_mask(k1, a, b) simde_mm512_mask_cmpeq_epu16_mask(k1, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_cmpeq_epu32_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpeq_epu32_mask(a, b); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u32), a_.u32 == b_.u32); + r = simde_mm_movepi32_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r |= (a_.u32[i] == b_.u32[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpeq_epu32_mask + #define _mm_cmpeq_epu32_mask(a, b) simde_mm_cmpeq_epu32_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_mask_cmpeq_epu32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpeq_epu32_mask(k1, a, b); + #else + return simde_mm_cmpeq_epu32_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpeq_epu32_mask + #define _mm_mask_cmpeq_epu32_mask(k1, a, b) simde_mm_mask_cmpeq_epu32_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_cmpeq_epu32_mask(simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_cmpeq_epu32_mask(a, b); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) + return simde_mm256_movepi32_mask(simde_x_mm256_cmpeq_epu32(a, b)); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u32), a_.u32 == b_.u32); + r = simde_mm256_movepi32_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r |= (a_.u32[i] == b_.u32[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmpeq_epu32_mask + #define _mm256_cmpeq_epu32_mask(a, b) simde_mm256_cmpeq_epu32_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_mask_cmpeq_epu32_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_cmpeq_epu32_mask(k1, a, b); + #else + return simde_mm256_cmpeq_epu32_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmpeq_epu32_mask + #define _mm256_mask_cmpeq_epu32_mask(k1, a, b) simde_mm256_mask_cmpeq_epu32_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_cmpeq_epu32_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cmpeq_epu32_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask16 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + tmp.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u32), a_.u32 == b_.u32); + r = simde_mm512_movepi32_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r |= (a_.u32[i] == b_.u32[i]) ? 
(UINT16_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpeq_epu32_mask + #define _mm512_cmpeq_epu32_mask(a, b) simde_mm512_cmpeq_epu32_mask((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_mask_cmpeq_epu32_mask(simde__mmask32 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_cmpeq_epu32_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpeq_epu32_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpeq_epu32_mask + #define _mm512_mask_cmpeq_epu32_mask(k1, a, b) simde_mm512_mask_cmpeq_epu32_mask(k1, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_cmpeq_epu64_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpeq_epu64_mask(a, b); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u64), a_.u64 == b_.u64); + r = simde_mm_movepi64_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r |= (a_.u64[i] == b_.u64[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpeq_epu64_mask + #define _mm_cmpeq_epu64_mask(a, b) simde_mm_cmpeq_epu64_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_mask_cmpeq_epu64_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpeq_epu64_mask(k1, a, b); + #else + return simde_mm_cmpeq_epu64_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpeq_epu64_mask + #define _mm_mask_cmpeq_epu64_mask(k1, a, b) simde_mm_mask_cmpeq_epu64_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_cmpeq_epu64_mask (simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_cmpeq_epu64_mask(a, b); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) + return simde_mm256_movepi64_mask(simde_x_mm256_cmpeq_epu64(a, b)); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u64), a_.u64 == b_.u64); + r = simde_mm256_movepi64_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r |= (a_.u64[i] == b_.u64[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmpeq_epu64_mask + #define _mm256_cmpeq_epu64_mask(a, b) simde_mm256_cmpeq_epu64_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_mask_cmpeq_epu64_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_cmpeq_epu64_mask(k1, a, b); + #else + return simde_mm256_cmpeq_epu64_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmpeq_epu64_mask + #define _mm256_mask_cmpeq_epu64_mask(k1, a, b) simde_mm256_mask_cmpeq_epu64_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_cmpeq_epu64_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cmpeq_epu64_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + tmp.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u64), a_.u64 == b_.u64); + r = simde_mm512_movepi64_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r |= (a_.u64[i] == b_.u64[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpeq_epu64_mask + #define _mm512_cmpeq_epu64_mask(a, b) simde_mm512_cmpeq_epu64_mask((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_mask_cmpeq_epu64_mask(simde__mmask8 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_cmpeq_epu64_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpeq_epu64_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpeq_epu64_mask + #define _mm512_mask_cmpeq_epu64_mask(k1, a, b) simde_mm512_mask_cmpeq_epu64_mask(k1, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 simde_mm512_cmpeq_ps_mask (simde__m512 a, simde__m512 b) { return simde_mm512_cmp_ps_mask(a, b, SIMDE_CMP_EQ_OQ); } @@ -225,6 +1156,20 @@ simde_mm512_cmpeq_ps_mask (simde__m512 a, simde__m512 b) { #define _mm512_cmpeq_ps_mask(a, b) simde_mm512_cmp_ps_mask(a, b, SIMDE_CMP_EQ_OQ) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_mask_cmpeq_ps_mask(simde__mmask16 k1, simde__m512 a, simde__m512 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_cmpeq_ps_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpeq_ps_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpeq_ps_mask + #define _mm512_mask_cmpeq_ps_mask(k1, a, b) simde_mm512_mask_cmpeq_ps_mask(k1, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 simde_mm512_cmpeq_pd_mask (simde__m512d a, simde__m512d b) { @@ -235,6 +1180,20 @@ simde_mm512_cmpeq_pd_mask (simde__m512d a, simde__m512d b) { #define _mm512_cmpeq_pd_mask(a, b) simde_mm512_cmp_pd_mask(a, b, SIMDE_CMP_EQ_OQ) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_mask_cmpeq_pd_mask(simde__mmask8 k1, simde__m512d a, simde__m512d b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_cmpeq_pd_mask(k1, a, b); 
+ #else + return k1 & simde_mm512_cmpeq_pd_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpeq_pd_mask + #define _mm512_mask_cmpeq_pd_mask(k1, a, b) simde_mm512_mask_cmpeq_pd_mask(k1, a, b) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/x86/avx512/cmpneq.h b/thirdparty/simde/x86/avx512/cmpneq.h index 6e9bf3364..9bac11c28 100644 --- a/thirdparty/simde/x86/avx512/cmpneq.h +++ b/thirdparty/simde/x86/avx512/cmpneq.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Evan Nemerson + * 2025 Michael R. Crusoe */ #if !defined(SIMDE_X86_AVX512_CMPNEQ_H) @@ -30,6 +31,7 @@ #include "types.h" #include "../avx2.h" #include "mov.h" +#include "cmp.h" #include "mov_mask.h" HEDLEY_DIAGNOSTIC_PUSH @@ -42,7 +44,26 @@ simde_mm_cmpneq_epi8_mask(simde__m128i a, simde__m128i b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) return _mm_cmpneq_epi8_mask(a, b); #else - return ~simde_mm_movepi8_mask(simde_mm_cmpeq_epi8(a, b)); + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask16 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i8), a_.i8 != b_.i8); + r = simde_mm_movepi8_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] != b_.u8[i]) ? 
(UINT16_C(1) << i) : 0; + } + #endif + + return r; #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) @@ -65,31 +86,105 @@ simde_mm_mask_cmpneq_epi8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b #endif SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_cmpneq_epu8_mask(simde__m128i a, simde__m128i b) { +simde__mmask32 +simde_mm256_cmpneq_epi8_mask(simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmpneq_epu8_mask(a, b); + return _mm256_cmpneq_epi8_mask(a, b); #else - return simde_mm_cmpneq_epi8_mask(a, b); + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask32 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i8), a_.i8 != b_.i8); + r = simde_mm256_movepi8_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] != b_.u8[i]) ? 
(UINT32_C(1) << i) : 0; + } + #endif + + return r; #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epu8_mask - #define _mm_cmpneq_epu8_mask(a, b) simde_mm_cmpneq_epu8_mask((a), (b)) + #undef _mm256_cmpneq_epi8_mask + #define _mm256_cmpneq_epi8_mask(a, b) simde_mm256_cmpneq_epi8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_mask_cmpneq_epu8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b) { +simde__mmask32 +simde_mm256_mask_cmpneq_epi8_mask(simde__mmask32 k1, simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmpneq_epu8_mask(k1, a, b); + return _mm256_mask_cmpneq_epi8_mask(k1, a, b); #else - return simde_mm_mask_cmpneq_epi8_mask(k1, a, b); + return simde_mm256_cmpneq_epi8_mask(a, b) & k1; #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epu8_mask - #define _mm_mask_cmpneq_epu8_mask(k1, a, b) simde_mm_mask_cmpneq_epu8_mask((k1), (a), (b)) + #undef _mm256_mask_cmpneq_epi8_mask + #define _mm256_mask_cmpneq_epi8_mask(k1, a, b) simde_mm256_mask_cmpneq_epi8_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask64 +simde_mm512_cmpneq_epi8_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpneq_epi8_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask64 r; + + #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { + const uint32_t t = HEDLEY_STATIC_CAST(uint32_t, simde_mm256_movemask_epi8(simde_x_mm256_cmpneq_epi8(a_.m256i[i], b_.m256i[i]))); + r |= HEDLEY_STATIC_CAST(uint64_t, t) << (i * 32); + } + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + 
simde__m512i_private tmp; + + tmp.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i8), a_.i8 != b_.i8); + r = simde_mm512_movepi8_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] != b_.u8[i]) ? (UINT64_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpneq_epi8_mask + #define _mm512_cmpneq_epi8_mask(a, b) simde_mm512_cmpneq_epi8_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask64 +simde_mm512_mask_cmpneq_epi8_mask(simde__mmask64 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_cmpneq_epi8_mask(k1, a, b); + #else + return simde_mm512_cmpneq_epi8_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpneq_epi8_mask + #define _mm512_mask_cmpneq_epi8_mask(k1, a, b) simde_mm512_mask_cmpneq_epi8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -98,7 +193,26 @@ simde_mm_cmpneq_epi16_mask(simde__m128i a, simde__m128i b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) return _mm_cmpneq_epi16_mask(a, b); #else - return ~simde_mm_movepi16_mask(simde_mm_cmpeq_epi16(a, b)); + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i16), a_.i16 != b_.i16); + r = simde_mm_movepi16_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] != b_.u16[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) @@ -121,43 +235,136 @@ simde_mm_mask_cmpneq_epi16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b #endif SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpneq_epu16_mask(simde__m128i a, simde__m128i b) { +simde__mmask16 +simde_mm256_cmpneq_epi16_mask(simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmpneq_epu16_mask(a, b); + return _mm256_cmpneq_epi16_mask(a, b); #else - return simde_mm_cmpneq_epi16_mask(a, b); + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask16 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i16), a_.i16 != b_.i16); + r = simde_mm256_movepi16_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] != b_.u16[i]) ? 
(UINT16_C(1) << i) : 0; + } + #endif + + return r; #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epu16_mask - #define _mm_cmpneq_epu16_mask(a, b) simde_mm_cmpneq_epu16_mask((a), (b)) + #undef _mm256_cmpneq_epi16_mask + #define _mm256_cmpneq_epi16_mask(a, b) simde_mm256_cmpneq_epi16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpneq_epu16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { +simde__mmask16 +simde_mm256_mask_cmpneq_epi16_mask(simde__mmask16 k1, simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmpneq_epu16_mask(k1, a, b); + return _mm256_mask_cmpneq_epi16_mask(k1, a, b); #else - return simde_mm_mask_cmpneq_epi16_mask(k1, a, b); + return simde_mm256_cmpneq_epi16_mask(a, b) & k1; #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epu16_mask - #define _mm_mask_cmpneq_epu16_mask(k1, a, b) simde_mm_mask_cmpneq_epu16_mask((k1), (a), (b)) + #undef _mm256_mask_cmpneq_epi16_mask + #define _mm256_mask_cmpneq_epi16_mask(k1, a, b) simde_mm256_mask_cmpneq_epi16_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmpneq_epi16_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpneq_epi16_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask32 r; + + #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { + const uint32_t t = HEDLEY_STATIC_CAST(uint32_t, simde_mm256_movepi16_mask(simde_x_mm256_cmpneq_epi16(a_.m256i[i], b_.m256i[i]))); + r |= HEDLEY_STATIC_CAST(uint64_t, t) << (i * 16); + } + #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + tmp.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i16), a_.i16 != b_.i16); + r = simde_mm512_movepi16_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] != b_.u16[i]) ? (UINT32_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpneq_epi16_mask + #define _mm512_cmpneq_epi16_mask(a, b) simde_mm512_cmpneq_epi16_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_mask_cmpneq_epi16_mask(simde__mmask32 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_cmpneq_epi16_mask(k1, a, b); + #else + return simde_mm512_cmpneq_epi16_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpneq_epi16_mask + #define _mm512_mask_cmpneq_epi16_mask(k1, a, b) simde_mm512_mask_cmpneq_epi16_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 simde_mm_cmpneq_epi32_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm_cmpneq_epi32_mask(a, b); #else - return (~simde_mm_movepi32_mask(simde_mm_cmpeq_epi32(a, b))) & 15; + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i32), a_.i32 != b_.i32); + r = simde_mm_movepi32_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r |= (a_.u32[i] != b_.u32[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_cmpneq_epi32_mask #define _mm_cmpneq_epi32_mask(a, b) simde_mm_cmpneq_epi32_mask((a), (b)) #endif @@ -165,55 +372,112 @@ simde_mm_cmpneq_epi32_mask(simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 simde_mm_mask_cmpneq_epi32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm_mask_cmpneq_epi32_mask(k1, a, b); #else return simde_mm_cmpneq_epi32_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epi32_mask #define _mm_mask_cmpneq_epi32_mask(k1, a, b) simde_mm_mask_cmpneq_epi32_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 -simde_mm_cmpneq_epu32_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmpneq_epu32_mask(a, b); +simde_mm256_cmpneq_epi32_mask(simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_cmpneq_epi32_mask(a, b); #else - return simde_mm_cmpneq_epi32_mask(a, b); + simde__m256i r = simde_x_mm256_cmpneq_epi32(a, b); + return simde_mm256_movepi32_mask(r); #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epu32_mask - #define _mm_cmpneq_epu32_mask(a, b) simde_mm_cmpneq_epu32_mask((a), (b)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmpneq_epi32_mask + #define _mm256_cmpneq_epi32_mask(a, b) simde_mm256_cmpneq_epi32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES 
simde__mmask8 -simde_mm_mask_cmpneq_epu32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmpneq_epu32_mask(k1, a, b); +simde_mm256_mask_cmpneq_epi32_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_cmpneq_epi32_mask(k1, a, b); #else - return simde_mm_mask_cmpneq_epi32_mask(k1, a, b); + return simde_mm256_cmpneq_epi32_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epu32_mask - #define _mm_mask_cmpneq_epu32_mask(k1, a, b) simde_mm_mask_cmpneq_epu32_mask((k1), (a), (b)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmpneq_epi32_mask + #define _mm256_mask_cmpneq_epi32_mask(k1, a, b) simde_mm256_mask_cmpneq_epi32_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_cmpneq_epi32_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cmpneq_epi32_mask(a, b); + #else + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { + r_.m256i[i] = simde_x_mm256_cmpneq_epi32(a_.m256i[i], b_.m256i[i]); + } + + return simde_mm512_movepi32_mask(simde__m512i_from_private(r_)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpneq_epi32_mask + #define _mm512_cmpneq_epi32_mask(a, b) simde_mm512_cmpneq_epi32_mask(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_mask_cmpneq_epi32_mask(simde__mmask16 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_cmpneq_epi32_mask(k1, a, b); + #else + return simde_mm512_cmpneq_epi32_mask(a, b) & k1; + #endif +} +#if 
defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpneq_epi32_mask + #define _mm512_mask_cmpneq_epi32_mask(k1, a, b) simde_mm512_mask_cmpneq_epi32_mask(k1, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 simde_mm_cmpneq_epi64_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm_cmpneq_epi64_mask(a, b); #else - return (~simde_mm_movepi64_mask(simde_mm_cmpeq_epi64(a, b))) & 3; + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i64), a_.i64 != b_.i64); + r = simde_mm_movepi64_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r |= (a_.u64[i] != b_.u64[i]) ? 
(UINT8_C(1) << i) : 0; + } + #endif + + return r; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_cmpneq_epi64_mask #define _mm_cmpneq_epi64_mask(a, b) simde_mm_cmpneq_epi64_mask((a), (b)) #endif @@ -221,83 +485,161 @@ simde_mm_cmpneq_epi64_mask(simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 simde_mm_mask_cmpneq_epi64_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm_mask_cmpneq_epi64_mask(k1, a, b); #else return simde_mm_cmpneq_epi64_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epi64_mask #define _mm_mask_cmpneq_epi64_mask(k1, a, b) simde_mm_mask_cmpneq_epi64_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 -simde_mm_cmpneq_epu64_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmpneq_epu64_mask(a, b); +simde_mm256_cmpneq_epi64_mask(simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_cmpneq_epi64_mask(a, b); #else - return simde_mm_cmpneq_epi64_mask(a, b); + simde__m256i r = simde_x_mm256_cmpneq_epi64(a, b); + return simde_mm256_movepi64_mask(r); #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epu64_mask - #define _mm_cmpneq_epu64_mask(a, b) simde_mm_cmpneq_epu64_mask((a), (b)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmpneq_epi64_mask + #define _mm256_cmpneq_epi64_mask(a, b) simde_mm256_cmpneq_epi64_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES 
simde__mmask8 -simde_mm_mask_cmpneq_epu64_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmpneq_epu64_mask(k1, a, b); +simde_mm256_mask_cmpneq_epi64_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_cmpneq_epi64_mask(k1, a, b); #else - return simde_mm_mask_cmpneq_epi64_mask(k1, a, b); + return simde_mm256_cmpneq_epi64_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epu64_mask - #define _mm_mask_cmpneq_epu64_mask(k1, a, b) simde_mm_mask_cmpneq_epu64_mask((k1), (a), (b)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmpneq_epi64_mask + #define _mm256_mask_cmpneq_epi64_mask(k1, a, b) simde_mm256_mask_cmpneq_epi64_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_cmpneq_epi8_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmpneq_epi8_mask(a, b); +simde__mmask8 +simde_mm512_cmpneq_epi64_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cmpneq_epi64_mask(a, b); #else - return ~simde_mm256_movepi8_mask(simde_mm256_cmpeq_epi8(a, b)); + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { + r_.m256i[i] = simde_x_mm256_cmpneq_epi64(a_.m256i[i], b_.m256i[i]); + } + + return simde_mm512_movepi64_mask(simde__m512i_from_private(r_)); #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epi8_mask - #define _mm256_cmpneq_epi8_mask(a, b) simde_mm256_cmpneq_epi8_mask((a), (b)) +#if 
defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpneq_epi64_mask + #define _mm512_cmpneq_epi64_mask(a, b) simde_mm512_cmpneq_epi64_mask(a, b) #endif SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_mask_cmpneq_epi8_mask(simde__mmask32 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmpneq_epi8_mask(k1, a, b); +simde__mmask8 +simde_mm512_mask_cmpneq_epi64_mask(simde__mmask8 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_cmpneq_epi64_mask(k1, a, b); #else - return simde_mm256_cmpneq_epi8_mask(a, b) & k1; + return simde_mm512_cmpneq_epi64_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epi8_mask - #define _mm256_mask_cmpneq_epi8_mask(k1, a, b) simde_mm256_mask_cmpneq_epi8_mask((k1), (a), (b)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpneq_epi64_mask + #define _mm512_mask_cmpneq_epi64_mask(k1, a, b) simde_mm512_mask_cmpneq_epi64_mask(k1, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm_cmpneq_epu8_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpneq_epu8_mask(a, b); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask16 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u8), a_.u8 != b_.u8); + r = simde_mm_movepi8_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] != b_.u8[i]) ? 
(UINT16_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpneq_epu8_mask + #define _mm_cmpneq_epu8_mask(a, b) simde_mm_cmpneq_epu8_mask((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm_mask_cmpneq_epu8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpneq_epu8_mask(k1, a, b); + #else + return simde_mm_cmpneq_epu8_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpneq_epu8_mask + #define _mm_mask_cmpneq_epu8_mask(k1, a, b) simde_mm_mask_cmpneq_epu8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES simde__mmask32 simde_mm256_cmpneq_epu8_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_cmpneq_epu8_mask(a, b); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) + return HEDLEY_STATIC_CAST(simde__mmask32, simde_mm256_movemask_epi8(simde_x_mm256_cmpneq_epu8(a, b))); #else - return simde_mm256_cmpneq_epi8_mask(a, b); + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask32 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u8), a_.u8 != b_.u8); + r = simde_mm256_movepi8_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] != b_.u8[i]) ? 
(UINT32_C(1) << i) : 0; + } + #endif + + return r; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_cmpneq_epu8_mask #define _mm256_cmpneq_epu8_mask(a, b) simde_mm256_cmpneq_epu8_mask((a), (b)) #endif @@ -305,55 +647,150 @@ simde_mm256_cmpneq_epu8_mask(simde__m256i a, simde__m256i b) { SIMDE_FUNCTION_ATTRIBUTES simde__mmask32 simde_mm256_mask_cmpneq_epu8_mask(simde__mmask32 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_mask_cmpneq_epu8_mask(k1, a, b); #else - return simde_mm256_mask_cmpneq_epi8_mask(k1, a, b); + return simde_mm256_cmpneq_epu8_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu8_mask #define _mm256_mask_cmpneq_epu8_mask(k1, a, b) simde_mm256_mask_cmpneq_epu8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_cmpneq_epi16_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmpneq_epi16_mask(a, b); +simde__mmask64 +simde_mm512_cmpneq_epu8_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpneq_epu8_mask(a, b); #else - return ~simde_mm256_movepi16_mask(simde_mm256_cmpeq_epi16(a, b)); + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask64 r; + + #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.m256i) / 
sizeof(a_.m256i[0])) ; i++) { + const uint32_t t = HEDLEY_STATIC_CAST(uint32_t, simde_mm256_movemask_epi8(simde_x_mm256_cmpneq_epu8(a_.m256i[i], b_.m256i[i]))); + r |= HEDLEY_STATIC_CAST(uint64_t, t) << (i * 32); + } + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + tmp.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u8), a_.u8 != b_.u8); + r = simde_mm512_movepi8_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { + r |= (a_.u8[i] != b_.u8[i]) ? (UINT64_C(1) << i) : 0; + } + #endif + + return r; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epi16_mask - #define _mm256_cmpneq_epi16_mask(a, b) simde_mm256_cmpneq_epi16_mask((a), (b)) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpneq_epu8_mask + #define _mm512_cmpneq_epu8_mask(a, b) simde_mm512_cmpneq_epu8_mask(a, b) #endif SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_mask_cmpneq_epi16_mask(simde__mmask16 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmpneq_epi16_mask(k1, a, b); +simde__mmask64 +simde_mm512_mask_cmpneq_epu8_mask(simde__mmask64 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_cmpneq_epu8_mask(k1, a, b); #else - return simde_mm256_cmpneq_epi16_mask(a, b) & k1; + return simde_mm512_cmpneq_epu8_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epi16_mask - #define _mm256_mask_cmpneq_epi16_mask(k1, a, b) simde_mm256_mask_cmpneq_epi16_mask((k1), (a), (b)) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpneq_epu8_mask + #define _mm512_mask_cmpneq_epu8_mask(k1, a, b) 
simde_mm512_mask_cmpneq_epu8_mask((k1), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_cmpneq_epu16_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpneq_epu16_mask(a, b); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u16), a_.u16 != b_.u16); + r = simde_mm_movepi16_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] != b_.u16[i]) ? (UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpneq_epu16_mask + #define _mm_cmpneq_epu16_mask(a, b) simde_mm_cmpneq_epu16_mask((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm_mask_cmpneq_epu16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpneq_epu16_mask(k1, a, b); + #else + return simde_mm_cmpneq_epu16_mask(a, b) & k1; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpneq_epu16_mask + #define _mm_mask_cmpneq_epu16_mask(k1, a, b) simde_mm_mask_cmpneq_epu16_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES simde__mmask16 simde_mm256_cmpneq_epu16_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_cmpneq_epu16_mask(a, b); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) + return 
simde_mm256_movepi16_mask(simde_x_mm256_cmpneq_epu16(a, b)); #else - return simde_mm256_cmpneq_epi16_mask(a, b); + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask16 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u16), a_.u16 != b_.u16); + r = simde_mm256_movepi16_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] != b_.u16[i]) ? (UINT16_C(1) << i) : 0; + } + #endif + + return r; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_cmpneq_epu16_mask #define _mm256_cmpneq_epu16_mask(a, b) simde_mm256_cmpneq_epu16_mask((a), (b)) #endif @@ -361,55 +798,142 @@ simde_mm256_cmpneq_epu16_mask(simde__m256i a, simde__m256i b) { SIMDE_FUNCTION_ATTRIBUTES simde__mmask16 simde_mm256_mask_cmpneq_epu16_mask(simde__mmask16 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_mask_cmpneq_epu16_mask(k1, a, b); #else - return simde_mm256_mask_cmpneq_epi16_mask(k1, a, b); + return simde_mm256_cmpneq_epu16_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu16_mask #define _mm256_mask_cmpneq_epu16_mask(k1, a, b) simde_mm256_mask_cmpneq_epu16_mask((k1), (a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmpneq_epu16_mask(simde__m512i a, simde__m512i 
b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpneq_epu16_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask32 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + tmp.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u16), a_.u16 != b_.u16); + r = simde_mm512_movepi16_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] != b_.u16[i]) ? (UINT32_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpneq_epu16_mask + #define _mm512_cmpneq_epu16_mask(a, b) simde_mm512_cmpneq_epu16_mask((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_mask_cmpneq_epu16_mask(simde__mmask32 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_cmpneq_epu16_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpneq_epu16_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpneq_epu16_mask + #define _mm512_mask_cmpneq_epu16_mask(k1, a, b) simde_mm512_mask_cmpneq_epu16_mask(k1, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 -simde_mm256_cmpneq_epi32_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmpneq_epi32_mask(a, b); +simde_mm_cmpneq_epu32_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpneq_epu32_mask(a, b); #else - return (~simde_mm256_movepi32_mask(simde_mm256_cmpeq_epi32(a, b))); + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.u32 = 
HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u32), a_.u32 != b_.u32); + r = simde_mm_movepi32_mask(simde__m128i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r |= (a_.u32[i] != b_.u32[i]) ? (UINT8_C(1) << i) : 0; + } + #endif + + return r; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epi32_mask - #define _mm256_cmpneq_epi32_mask(a, b) simde_mm256_cmpneq_epi32_mask((a), (b)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpneq_epu32_mask + #define _mm_cmpneq_epu32_mask(a, b) simde_mm_cmpneq_epu32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 -simde_mm256_mask_cmpneq_epi32_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmpneq_epi32_mask(k1, a, b); +simde_mm_mask_cmpneq_epu32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpneq_epu32_mask(k1, a, b); #else - return simde_mm256_cmpneq_epi32_mask(a, b) & k1; + return simde_mm_cmpneq_epu32_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epi32_mask - #define _mm256_mask_cmpneq_epi32_mask(k1, a, b) simde_mm256_mask_cmpneq_epi32_mask((k1), (a), (b)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpneq_epu32_mask + #define _mm_mask_cmpneq_epu32_mask(k1, a, b) simde_mm_mask_cmpneq_epu32_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 simde_mm256_cmpneq_epu32_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_cmpneq_epu32_mask(a, b); + #elif 
SIMDE_NATURAL_VECTOR_SIZE_LE(256) + return simde_mm256_movepi32_mask(simde_x_mm256_cmpneq_epu32(a, b)); #else - return simde_mm256_cmpneq_epi32_mask(a, b); + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u32), a_.u32 != b_.u32); + r = simde_mm256_movepi32_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r |= (a_.u32[i] != b_.u32[i]) ? (UINT8_C(1) << i) : 0; + } + #endif + + return r; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_cmpneq_epu32_mask #define _mm256_cmpneq_epu32_mask(a, b) simde_mm256_cmpneq_epu32_mask((a), (b)) #endif @@ -417,55 +941,142 @@ simde_mm256_cmpneq_epu32_mask(simde__m256i a, simde__m256i b) { SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 simde_mm256_mask_cmpneq_epu32_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_mask_cmpneq_epu32_mask(k1, a, b); #else - return simde_mm256_mask_cmpneq_epi32_mask(k1, a, b); + return simde_mm256_cmpneq_epu32_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu32_mask #define _mm256_mask_cmpneq_epu32_mask(k1, a, b) simde_mm256_mask_cmpneq_epu32_mask((k1), (a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_cmpneq_epu32_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cmpneq_epu32_mask(a, b); + #else + 
simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask16 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + tmp.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u32), a_.u32 != b_.u32); + r = simde_mm512_movepi32_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r |= (a_.u32[i] != b_.u32[i]) ? (UINT16_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpneq_epu32_mask + #define _mm512_cmpneq_epu32_mask(a, b) simde_mm512_cmpneq_epu32_mask((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_mask_cmpneq_epu32_mask(simde__mmask32 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_cmpneq_epu32_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpneq_epu32_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpneq_epu32_mask + #define _mm512_mask_cmpneq_epu32_mask(k1, a, b) simde_mm512_mask_cmpneq_epu32_mask(k1, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 -simde_mm256_cmpneq_epi64_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmpneq_epi64_mask(a, b); +simde_mm_cmpneq_epu64_mask(simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_cmpneq_epu64_mask(a, b); #else - return (~simde_mm256_movepi64_mask(simde_mm256_cmpeq_epi64(a, b))) & 15; + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m128i_private tmp; + + tmp.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u64), a_.u64 != b_.u64); + r = simde_mm_movepi64_mask(simde__m128i_from_private(tmp)); + #else + r 
= 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r |= (a_.u64[i] != b_.u64[i]) ? (UINT8_C(1) << i) : 0; + } + #endif + + return r; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epi64_mask - #define _mm256_cmpneq_epi64_mask(a, b) simde_mm256_cmpneq_epi64_mask((a), (b)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpneq_epu64_mask + #define _mm_cmpneq_epu64_mask(a, b) simde_mm_cmpneq_epu64_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 -simde_mm256_mask_cmpneq_epi64_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmpneq_epi64_mask(k1, a, b); +simde_mm_mask_cmpneq_epu64_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_cmpneq_epu64_mask(k1, a, b); #else - return simde_mm256_cmpneq_epi64_mask(a, b) & k1; + return simde_mm_cmpneq_epu64_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epi64_mask - #define _mm256_mask_cmpneq_epi64_mask(k1, a, b) simde_mm256_mask_cmpneq_epi64_mask((k1), (a), (b)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_cmpneq_epu64_mask + #define _mm_mask_cmpneq_epu64_mask(k1, a, b) simde_mm_mask_cmpneq_epu64_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 simde_mm256_cmpneq_epu64_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_cmpneq_epu64_mask(a, b); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) + return simde_mm256_movepi64_mask(simde_x_mm256_cmpneq_epu64(a, b)); #else - return simde_mm256_cmpneq_epi64_mask(a, 
b); + simde__m256i_private + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m256i_private tmp; + + tmp.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u64), a_.u64 != b_.u64); + r = simde_mm256_movepi64_mask(simde__m256i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r |= (a_.u64[i] != b_.u64[i]) ? (UINT8_C(1) << i) : 0; + } + #endif + + return r; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_cmpneq_epu64_mask #define _mm256_cmpneq_epu64_mask(a, b) simde_mm256_cmpneq_epu64_mask((a), (b)) #endif @@ -473,17 +1084,112 @@ simde_mm256_cmpneq_epu64_mask(simde__m256i a, simde__m256i b) { SIMDE_FUNCTION_ATTRIBUTES simde__mmask8 simde_mm256_mask_cmpneq_epu64_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_mask_cmpneq_epu64_mask(k1, a, b); #else - return simde_mm256_mask_cmpneq_epi64_mask(k1, a, b); + return simde_mm256_cmpneq_epu64_mask(a, b) & k1; #endif } -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu64_mask #define _mm256_mask_cmpneq_epu64_mask(k1, a, b) simde_mm256_mask_cmpneq_epu64_mask((k1), (a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_cmpneq_epu64_mask(simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cmpneq_epu64_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask8 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + 
simde__m512i_private tmp; + + tmp.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u64), a_.u64 != b_.u64); + r = simde_mm512_movepi64_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r |= (a_.u64[i] != b_.u64[i]) ? (UINT8_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpneq_epu64_mask + #define _mm512_cmpneq_epu64_mask(a, b) simde_mm512_cmpneq_epu64_mask((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_mask_cmpneq_epu64_mask(simde__mmask8 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_cmpneq_epu64_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpneq_epu64_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpneq_epu64_mask + #define _mm512_mask_cmpneq_epu64_mask(k1, a, b) simde_mm512_mask_cmpneq_epu64_mask(k1, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_cmpneq_ps_mask (simde__m512 a, simde__m512 b) { + return simde_mm512_cmp_ps_mask(a, b, SIMDE_CMP_NEQ_OQ); +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpneq_ps_mask + #define _mm512_cmpneq_ps_mask(a, b) simde_mm512_cmp_ps_mask(a, b, SIMDE_CMP_NEQ_OQ) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_mask_cmpneq_ps_mask(simde__mmask16 k1, simde__m512 a, simde__m512 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_cmpneq_ps_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpneq_ps_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpneq_ps_mask + #define _mm512_mask_cmpneq_ps_mask(k1, a, b) simde_mm512_mask_cmpneq_ps_mask(k1, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_cmpneq_pd_mask (simde__m512d a, simde__m512d b) { + return 
simde_mm512_cmp_pd_mask(a, b, SIMDE_CMP_NEQ_OQ); +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpneq_pd_mask + #define _mm512_cmpneq_pd_mask(a, b) simde_mm512_cmp_pd_mask(a, b, SIMDE_CMP_NEQ_OQ) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_mask_cmpneq_pd_mask(simde__mmask8 k1, simde__m512d a, simde__m512d b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_cmpneq_pd_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpneq_pd_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpneq_pd_mask + #define _mm512_mask_cmpneq_pd_mask(k1, a, b) simde_mm512_mask_cmpneq_pd_mask(k1, a, b) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/x86/avx512/compress.h b/thirdparty/simde/x86/avx512/compress.h index 06fffc733..882575825 100644 --- a/thirdparty/simde/x86/avx512/compress.h +++ b/thirdparty/simde/x86/avx512/compress.h @@ -1,3 +1,32 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2022 Paul Saab + * 2023-2026 Michael R. Crusoe + * 2025 Adrian Riedl + */ + #if !defined(SIMDE_X86_AVX512_COMPRESS_H) #define SIMDE_X86_AVX512_COMPRESS_H @@ -7,6 +36,362 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_mask_compress_pd (simde__m128d src, simde__mmask8 k, simde__m128d a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + return _mm_mask_compress_pd(src, k, a); + #else + simde__m128d_private + a_ = simde__m128d_to_private(a), + src_ = simde__m128d_to_private(src); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + if ((k >> i) & 1) { + a_.f64[ri++] = a_.f64[i]; + } + } + + for ( ; ri < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; ri++) { + a_.f64[ri] = src_.f64[ri]; + } + + return simde__m128d_from_private(a_); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_compress_pd + #define _mm_mask_compress_pd(src, k, a) simde_mm_mask_compress_pd(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m128d a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) + _mm_mask_compressstoreu_pd(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = HEDLEY_STATIC_CAST(simde__mmask8, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); + _mm_mask_storeu_pd(base_addr, store_mask, 
_mm_maskz_compress_pd(k, a)); + #else + simde__m128d_private + a_ = simde__m128d_to_private(a); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + if ((k >> i) & 1) { + a_.f64[ri++] = a_.f64[i]; + } + } + + simde_memcpy(base_addr, &a_, ri * sizeof(a_.f64[0])); + + return; + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_compressstoreu_pd + #define _mm_mask_compressstoreu_pd(base_addr, k, a) simde_mm_mask_compressstoreu_pd(base_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_maskz_compress_pd (simde__mmask8 k, simde__m128d a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + return _mm_maskz_compress_pd(k, a); + #else + simde__m128d_private + a_ = simde__m128d_to_private(a); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + if ((k >> i) & 1) { + a_.f64[ri++] = a_.f64[i]; + } + } + + for ( ; ri < (sizeof(a_.f64) / sizeof(a_.f64[0])); ri++) { + a_.f64[ri] = SIMDE_FLOAT64_C(0.0); + } + + return simde__m128d_from_private(a_); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_compress_pd + #define _mm_maskz_compress_pd(k, a) simde_mm_maskz_compress_pd(k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_mask_compress_ps (simde__m128 src, simde__mmask8 k, simde__m128 a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + return _mm_mask_compress_ps(src, k, a); + #else + simde__m128_private + a_ = simde__m128_to_private(a), + src_ = simde__m128_to_private(src); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + if ((k >> i) & 1) { + a_.f32[ri++] = a_.f32[i]; + } + } + + for ( ; ri < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; 
ri++) { + a_.f32[ri] = src_.f32[ri]; + } + + return simde__m128_from_private(a_); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_compress_ps + #define _mm_mask_compress_ps(src, k, a) simde_mm_mask_compress_ps(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_mask_compressstoreu_ps (void* base_addr, simde__mmask8 k, simde__m128 a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) + _mm_mask_compressstoreu_ps(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = HEDLEY_STATIC_CAST(simde__mmask8, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); + _mm_mask_storeu_ps(base_addr, store_mask, _mm_maskz_compress_ps(k, a)); + #else + simde__m128_private + a_ = simde__m128_to_private(a); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + if ((k >> i) & 1) { + a_.f32[ri++] = a_.f32[i]; + } + } + + simde_memcpy(base_addr, &a_, ri * sizeof(a_.f32[0])); + + return; + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_compressstoreu_ps + #define _mm_mask_compressstoreu_ps(base_addr, k, a) simde_mm_mask_compressstoreu_ps(base_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_maskz_compress_ps (simde__mmask8 k, simde__m128 a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + return _mm_maskz_compress_ps(k, a); + #else + simde__m128_private + a_ = simde__m128_to_private(a); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + if ((k >> i) & 1) { + a_.f32[ri++] = a_.f32[i]; + } + } + + for ( ; ri < (sizeof(a_.f32) / sizeof(a_.f32[0])); ri++) { + a_.f32[ri] = SIMDE_FLOAT32_C(0.0); 
+ } + + return simde__m128_from_private(a_); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_compress_ps + #define _mm_maskz_compress_ps(k, a) simde_mm_maskz_compress_ps(k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_compress_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + return _mm_mask_compress_epi32(src, k, a); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + src_ = simde__m128i_to_private(src); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + if ((k >> i) & 1) { + a_.i32[ri++] = a_.i32[i]; + } + } + + for ( ; ri < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; ri++) { + a_.i32[ri] = src_.i32[ri]; + } + + return simde__m128i_from_private(a_); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_compress_epi32 + #define _mm_mask_compress_epi32(src, k, a) simde_mm_mask_compress_epi32(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_mask_compressstoreu_epi32 (void* base_addr, simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) + _mm_mask_compressstoreu_epi32(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = HEDLEY_STATIC_CAST(simde__mmask8, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); + _mm_mask_storeu_epi32(base_addr, store_mask, _mm_maskz_compress_epi32(k, a)); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + if ((k >> i) & 1) { + a_.i32[ri++] = a_.i32[i]; + } + } + + 
simde_memcpy(base_addr, &a_, ri * sizeof(a_.i32[0])); + + return; + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_compressstoreu_epi32 + #define _mm_mask_compressstoreu_epi32(base_addr, k, a) simde_mm_mask_compressstoreu_epi32(base_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_compress_epi32 (simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + return _mm_maskz_compress_epi32(k, a); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + if ((k >> i) & 1) { + a_.i32[ri++] = a_.i32[i]; + } + } + + for ( ; ri < (sizeof(a_.i32) / sizeof(a_.i32[0])); ri++) { + a_.i32[ri] = INT32_C(0); + } + + return simde__m128i_from_private(a_); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_compress_epi32 + #define _mm_maskz_compress_epi32(k, a) simde_mm_maskz_compress_epi32(k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_compress_epi64 (simde__m128i src, simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + return _mm_mask_compress_epi64(src, k, a); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + src_ = simde__m128i_to_private(src); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + if ((k >> i) & 1) { + a_.i64[ri++] = a_.i64[i]; + } + } + + for ( ; ri < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; ri++) { + a_.i64[ri] = src_.i64[ri]; + } + + return simde__m128i_from_private(a_); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_compress_epi64 + #define 
_mm_mask_compress_epi64(src, k, a) simde_mm_mask_compress_epi64(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) + _mm_mask_compressstoreu_epi64(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = HEDLEY_STATIC_CAST(simde__mmask8, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); + _mm_mask_storeu_epi64(base_addr, store_mask, _mm_maskz_compress_epi64(k, a)); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + if ((k >> i) & 1) { + a_.i64[ri++] = a_.i64[i]; + } + } + + simde_memcpy(base_addr, &a_, ri * sizeof(a_.i64[0])); + + return; + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_compressstoreu_epi64 + #define _mm_mask_compressstoreu_epi64(base_addr, k, a) simde_mm_mask_compressstoreu_epi64(base_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_compress_epi64 (simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + return _mm_maskz_compress_epi64(k, a); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + if ((k >> i) & 1) { + a_.i64[ri++] = a_.i64[i]; + } + } + + for ( ; ri < (sizeof(a_.i64) / sizeof(a_.i64[0])); ri++) { + a_.i64[ri] = INT64_C(0); + } + + return simde__m128i_from_private(a_); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_compress_epi64 + #define 
_mm_maskz_compress_epi64(k, a) simde_mm_maskz_compress_epi64(k, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256d simde_mm256_mask_compress_pd (simde__m256d src, simde__mmask8 k, simde__m256d a) { @@ -43,7 +428,7 @@ simde_mm256_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m25 #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_pd(base_addr, k, a); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); + simde__mmask8 store_mask = HEDLEY_STATIC_CAST(simde__mmask8, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); _mm256_mask_storeu_pd(base_addr, store_mask, _mm256_maskz_compress_pd(k, a)); #else simde__m256d_private @@ -132,7 +517,7 @@ simde_mm256_mask_compressstoreu_ps (void* base_addr, simde__mmask8 k, simde__m25 #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_ps(base_addr, k, a); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); + simde__mmask8 store_mask = HEDLEY_STATIC_CAST(simde__mmask8, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); _mm256_mask_storeu_ps(base_addr, store_mask, _mm256_maskz_compress_ps(k, a)); #else simde__m256_private @@ -221,7 +606,7 @@ simde_mm256_mask_compressstoreu_epi32 (void* base_addr, simde__mmask8 k, simde__ #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_epi32(base_addr, k, a); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); + simde__mmask8 store_mask = HEDLEY_STATIC_CAST(simde__mmask8, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); _mm256_mask_storeu_epi32(base_addr, store_mask, 
_mm256_maskz_compress_epi32(k, a)); #else simde__m256i_private @@ -310,7 +695,7 @@ simde_mm256_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__ #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_epi64(base_addr, k, a); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); + simde__mmask8 store_mask = HEDLEY_STATIC_CAST(simde__mmask8, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); _mm256_mask_storeu_epi64(base_addr, store_mask, _mm256_maskz_compress_epi64(k, a)); #else simde__m256i_private @@ -399,7 +784,7 @@ simde_mm512_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m51 #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_pd(base_addr, k, a); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); + simde__mmask8 store_mask = HEDLEY_STATIC_CAST(simde__mmask8, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); _mm512_mask_storeu_pd(base_addr, store_mask, _mm512_maskz_compress_pd(k, a)); #else simde__m512d_private @@ -488,7 +873,7 @@ simde_mm512_mask_compressstoreu_ps (void* base_addr, simde__mmask16 k, simde__m5 #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_ps(base_addr, k, a); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask16 store_mask = _pext_u32(-1, k); + simde__mmask16 store_mask = HEDLEY_STATIC_CAST(simde__mmask16, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); _mm512_mask_storeu_ps(base_addr, store_mask, _mm512_maskz_compress_ps(k, a)); #else simde__m512_private @@ -577,7 +962,7 @@ simde_mm512_mask_compressstoreu_epi16 (void* 
base_addr, simde__mmask32 k, simde_ #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_epi16(base_addr, k, a); #elif defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(__znver4__) - simde__mmask32 store_mask = _pext_u32(-1, k); + simde__mmask32 store_mask = _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k); _mm512_mask_storeu_epi16(base_addr, store_mask, _mm512_maskz_compress_epi16(k, a)); #else simde__m512i_private @@ -607,7 +992,7 @@ simde_mm512_mask_compressstoreu_epi32 (void* base_addr, simde__mmask16 k, simde_ #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_epi32(base_addr, k, a); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask16 store_mask = _pext_u32(-1, k); + simde__mmask16 store_mask = HEDLEY_STATIC_CAST(simde__mmask16, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); _mm512_mask_storeu_epi32(base_addr, store_mask, _mm512_maskz_compress_epi32(k, a)); #else simde__m512i_private @@ -696,7 +1081,7 @@ simde_mm512_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__ #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_epi64(base_addr, k, a); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); + simde__mmask8 store_mask = HEDLEY_STATIC_CAST(simde__mmask8, _pext_u32(HEDLEY_STATIC_CAST(unsigned int, -1), k)); _mm512_mask_storeu_epi64(base_addr, store_mask, _mm512_maskz_compress_epi64(k, a)); #else simde__m512i_private diff --git a/thirdparty/simde/x86/avx512/conflict.h b/thirdparty/simde/x86/avx512/conflict.h index 239aef9b9..be661f99c 100644 --- a/thirdparty/simde/x86/avx512/conflict.h +++ b/thirdparty/simde/x86/avx512/conflict.h @@ -1,11 +1,38 @@ +/* SPDX-License-Identifier: MIT + * + * 
Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2025 Michael R. 
Crusoe + */ + #if !defined(SIMDE_X86_AVX512_CONFLICT_H) #define SIMDE_X86_AVX512_CONFLICT_H #include "types.h" -#include "mov_mask.h" #include "mov.h" #include "cmpeq.h" #include "set1.h" +#include "setzero.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS diff --git a/thirdparty/simde/x86/avx512/cvt.h b/thirdparty/simde/x86/avx512/cvt.h index 579bcac10..0e1682bb0 100644 --- a/thirdparty/simde/x86/avx512/cvt.h +++ b/thirdparty/simde/x86/avx512/cvt.h @@ -32,6 +32,7 @@ #include "types.h" #include "mov.h" +#include "setzero.h" #include "../../simde-f16.h" HEDLEY_DIAGNOSTIC_PUSH diff --git a/thirdparty/simde/x86/avx512/cvts.h b/thirdparty/simde/x86/avx512/cvts.h index 0194889a7..73aab5886 100644 --- a/thirdparty/simde/x86/avx512/cvts.h +++ b/thirdparty/simde/x86/avx512/cvts.h @@ -32,6 +32,7 @@ #include "types.h" #include "mov.h" #include "storeu.h" +#include "setzero.h" #include "loadu.h" HEDLEY_DIAGNOSTIC_PUSH diff --git a/thirdparty/simde/x86/avx512/dbsad.h b/thirdparty/simde/x86/avx512/dbsad.h index c9a8e660e..d6b3c6dfc 100644 --- a/thirdparty/simde/x86/avx512/dbsad.h +++ b/thirdparty/simde/x86/avx512/dbsad.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_DBSAD_H) #define SIMDE_X86_AVX512_DBSAD_H diff --git a/thirdparty/simde/x86/avx512/dpbf16.h b/thirdparty/simde/x86/avx512/dpbf16.h index 81e2aead2..05195b353 100644 --- a/thirdparty/simde/x86/avx512/dpbf16.h +++ b/thirdparty/simde/x86/avx512/dpbf16.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2023 Michael R. Crusoe + */ + #if !defined(SIMDE_X86_AVX512_DPBF16_H) #define SIMDE_X86_AVX512_DPBF16_H diff --git a/thirdparty/simde/x86/avx512/dpbusd.h b/thirdparty/simde/x86/avx512/dpbusd.h index c45f3ca30..6e5af7e29 100644 --- a/thirdparty/simde/x86/avx512/dpbusd.h +++ b/thirdparty/simde/x86/avx512/dpbusd.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_DPBUSD_H) #define SIMDE_X86_AVX512_DPBUSD_H diff --git a/thirdparty/simde/x86/avx512/dpbusds.h b/thirdparty/simde/x86/avx512/dpbusds.h index 0168fed2a..136b8af79 100644 --- a/thirdparty/simde/x86/avx512/dpbusds.h +++ b/thirdparty/simde/x86/avx512/dpbusds.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_DPBUSDS_H) #define SIMDE_X86_AVX512_DPBUSDS_H diff --git a/thirdparty/simde/x86/avx512/dpwssd.h b/thirdparty/simde/x86/avx512/dpwssd.h index 33b0ce55f..bd6b743b1 100644 --- a/thirdparty/simde/x86/avx512/dpwssd.h +++ b/thirdparty/simde/x86/avx512/dpwssd.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_DPWSSD_H) #define SIMDE_X86_AVX512_DPWSSD_H diff --git a/thirdparty/simde/x86/avx512/dpwssds.h b/thirdparty/simde/x86/avx512/dpwssds.h index ea720917f..becd44b93 100644 --- a/thirdparty/simde/x86/avx512/dpwssds.h +++ b/thirdparty/simde/x86/avx512/dpwssds.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_DPWSSDS_H) #define SIMDE_X86_AVX512_DPWSSDS_H diff --git a/thirdparty/simde/x86/avx512/expand.h b/thirdparty/simde/x86/avx512/expand.h index 4afba87f3..d0c76ebe4 100644 --- a/thirdparty/simde/x86/avx512/expand.h +++ b/thirdparty/simde/x86/avx512/expand.h @@ -23,23 +23,407 @@ * Copyright: * 2021 Andrew Rodriguez * 2021 Evan Nemerson + * 2025 Michael R. 
Crusoe */ #if !defined(SIMDE_X86_AVX512_EXPAND_H) #define SIMDE_X86_AVX512_EXPAND_H #include "types.h" -#include "mov.h" -#include "mov_mask.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_expand_epi8(simde__m128i src, simde__mmask16 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_expand_epi8(src, k, a); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + src_ = simde__m128i_to_private(src); + simde__m128i_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i8[i] = a_.i8[src_idx++]; + } else { + r_.i8[i] = src_.i8[i]; + } + } + + return simde__m128i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_expand_epi8 + #define _mm_mask_expand_epi8(src, k, a) simde_mm_mask_expand_epi8((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_expand_epi8(simde__mmask16 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_expand_epi8(k, a); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i8[i] = a_.i8[src_idx++]; + } else { + r_.i8[i] = INT8_C(0); + } + } + + return simde__m128i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_expand_epi8 + #define _mm_maskz_expand_epi8(k, a) simde_mm_maskz_expand_epi8((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_expand_epi8(simde__m256i src, simde__mmask32 k, simde__m256i a) { + #if 
defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_expand_epi8(src, k, a); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + src_ = simde__m256i_to_private(src); + simde__m256i_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i8[i] = a_.i8[src_idx++]; + } else { + r_.i8[i] = src_.i8[i]; + } + } + + return simde__m256i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_expand_epi8 + #define _mm256_mask_expand_epi8(src, k, a) simde_mm256_mask_expand_epi8((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_expand_epi8(simde__mmask32 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_expand_epi8(k, a); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i8[i] = a_.i8[src_idx++]; + } else { + r_.i8[i] = INT8_C(0); + } + } + + return simde__m256i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_expand_epi8 + #define _mm256_maskz_expand_epi8(k, a) simde_mm256_maskz_expand_epi8((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_expand_epi8(simde__m512i src, simde__mmask64 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) + return _mm512_mask_expand_epi8(src, k, a); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + src_ = simde__m512i_to_private(src); + simde__m512i_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + if (k & 
(UINT64_C(1) << i)) { + r_.i8[i] = a_.i8[src_idx++]; + } else { + r_.i8[i] = src_.i8[i]; + } + } + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_expand_epi8 + #define _mm512_mask_expand_epi8(src, k, a) simde_mm512_mask_expand_epi8((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_expand_epi8(simde__mmask64 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) + return _mm512_maskz_expand_epi8(k, a); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i8[i] = a_.i8[src_idx++]; + } else { + r_.i8[i] = INT8_C(0); + } + } + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_expand_epi8 + #define _mm512_maskz_expand_epi8(k, a) simde_mm512_maskz_expand_epi8((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_expand_epi16(simde__m128i src, simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_expand_epi16(src, k, a); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + src_ = simde__m128i_to_private(src); + simde__m128i_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i16[i] = a_.i16[src_idx++]; + } else { + r_.i16[i] = src_.i16[i]; + } + } + + return simde__m128i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_expand_epi16 + #define _mm_mask_expand_epi16(src, k, a) simde_mm_mask_expand_epi16((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i 
+simde_mm_maskz_expand_epi16(simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_expand_epi16(k, a); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i16[i] = a_.i16[src_idx++]; + } else { + r_.i16[i] = INT16_C(0); + } + } + + return simde__m128i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_expand_epi16 + #define _mm_maskz_expand_epi16(k, a) simde_mm_maskz_expand_epi16((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_expand_epi16(simde__m256i src, simde__mmask16 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_expand_epi16(src, k, a); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + src_ = simde__m256i_to_private(src); + simde__m256i_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i16[i] = a_.i16[src_idx++]; + } else { + r_.i16[i] = src_.i16[i]; + } + } + + return simde__m256i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_expand_epi16 + #define _mm256_mask_expand_epi16(src, k, a) simde_mm256_mask_expand_epi16((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_expand_epi16(simde__mmask16 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_expand_epi16(k, a); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < 
(sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i16[i] = a_.i16[src_idx++]; + } else { + r_.i16[i] = INT16_C(0); + } + } + + return simde__m256i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_expand_epi16 + #define _mm256_maskz_expand_epi16(k, a) simde_mm256_maskz_expand_epi16((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_expand_epi16(simde__m512i src, simde__mmask32 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) + return _mm512_mask_expand_epi16(src, k, a); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + src_ = simde__m512i_to_private(src); + simde__m512i_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i16[i] = a_.i16[src_idx++]; + } else { + r_.i16[i] = src_.i16[i]; + } + } + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_expand_epi16 + #define _mm512_mask_expand_epi16(src, k, a) simde_mm512_mask_expand_epi16((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_expand_epi16(simde__mmask32 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) + return _mm512_maskz_expand_epi16(k, a); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i16[i] = a_.i16[src_idx++]; + } else { + r_.i16[i] = INT16_C(0); + } + } + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_expand_epi16 + #define _mm512_maskz_expand_epi16(k, a) simde_mm512_maskz_expand_epi16((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde__m128i +simde_mm_mask_expand_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_expand_epi32(src, k, a); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + src_ = simde__m128i_to_private(src); + simde__m128i_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i32[i] = a_.i32[src_idx++]; + } else { + r_.i32[i] = src_.i32[i]; + } + } + + return simde__m128i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_expand_epi32 + #define _mm_mask_expand_epi32(src, k, a) simde_mm_mask_expand_epi32((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_expand_epi32(simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_expand_epi32(k, a); + #else + simde__m128i_private + a_ = simde__m128i_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i32[i] = a_.i32[src_idx++]; + } else { + r_.i32[i] = INT32_C(0); + } + } + + return simde__m128i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_expand_epi32 + #define _mm_maskz_expand_epi32(k, a) simde_mm_maskz_expand_epi32((k), (a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_mask_expand_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_mask_expand_epi32(src, k, a); #else simde__m256i_private @@ -59,7 +443,7 @@ simde_mm256_mask_expand_epi32(simde__m256i 
src, simde__mmask8 k, simde__m256i a)
     return simde__m256i_from_private(r_);
   #endif
 }
-#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
   #undef _mm256_mask_expand_epi32
   #define _mm256_mask_expand_epi32(src, k, a) simde_mm256_mask_expand_epi32((src), (k), (a))
 #endif
@@ -67,7 +451,7 @@ simde_mm256_mask_expand_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a)
 SIMDE_FUNCTION_ATTRIBUTES
 simde__m256i
 simde_mm256_maskz_expand_epi32(simde__mmask8 k, simde__m256i a) {
-  #if defined(SIMDE_X86_AVX512VL_NATIVE)
+  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
     return _mm256_maskz_expand_epi32(k, a);
   #else
     simde__m256i_private
@@ -86,11 +470,562 @@ simde_mm256_maskz_expand_epi32(simde__mmask8 k, simde__m256i a) {
     return simde__m256i_from_private(r_);
   #endif
 }
-#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
   #undef _mm256_maskz_expand_epi32
   #define _mm256_maskz_expand_epi32(k, a) simde_mm256_maskz_expand_epi32((k), (a))
 #endif
 
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_mask_expand_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a) {
+  #if defined(SIMDE_X86_AVX512F_NATIVE)
+    return _mm512_mask_expand_epi32(src, k, a);
+  #else
+    simde__m512i_private
+      a_ = simde__m512i_to_private(a),
+      src_ = simde__m512i_to_private(src);
+    simde__m512i_private r_;
+
+    size_t src_idx = 0;
+    for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+      if (k & (UINT64_C(1) << i)) {
+        r_.i32[i] = a_.i32[src_idx++];
+      } else {
+        r_.i32[i] = src_.i32[i];
+      }
+    }
+
+    return simde__m512i_from_private(r_);
+  #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
+  #undef _mm512_mask_expand_epi32
+  #define _mm512_mask_expand_epi32(src, k, a) simde_mm512_mask_expand_epi32((src), (k), (a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m512i
+simde_mm512_maskz_expand_epi32(simde__mmask16 k, simde__m512i a) {
+  #if defined(SIMDE_X86_AVX512F_NATIVE)
+    return _mm512_maskz_expand_epi32(k, a);
+  #else
+    simde__m512i_private
+      a_ = simde__m512i_to_private(a),
+      r_;
+
+    size_t src_idx = 0;
+    for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
+      if (k & (UINT64_C(1) << i)) {
+        r_.i32[i] = a_.i32[src_idx++];
+      } else {
+        r_.i32[i] = INT32_C(0);
+      }
+    }
+
+    return simde__m512i_from_private(r_);
+  #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
+  #undef _mm512_maskz_expand_epi32
+  #define _mm512_maskz_expand_epi32(k, a) simde_mm512_maskz_expand_epi32((k), (a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_mask_expand_epi64(simde__m128i src, simde__mmask8 k, simde__m128i a) {
+  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+    return _mm_mask_expand_epi64(src, k, a);
+  #else
+    simde__m128i_private
+      a_ = simde__m128i_to_private(a),
+      src_ = simde__m128i_to_private(src);
+    simde__m128i_private r_;
+
+    size_t src_idx = 0;
+    for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+      if (k & (UINT64_C(1) << i)) {
+        r_.i64[i] = a_.i64[src_idx++];
+      } else {
+        r_.i64[i] = src_.i64[i];
+      }
+    }
+
+    return simde__m128i_from_private(r_);
+  #endif
+}
+#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
+  #undef _mm_mask_expand_epi64
+  #define _mm_mask_expand_epi64(src, k, a) simde_mm_mask_expand_epi64((src), (k), (a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m128i
+simde_mm_maskz_expand_epi64(simde__mmask8 k, simde__m128i a) {
+  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+    return _mm_maskz_expand_epi64(k, a);
+  #else
+    simde__m128i_private
+      a_ = simde__m128i_to_private(a),
+      r_;
+
+    size_t src_idx = 0;
+    for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
+      if (k & (UINT64_C(1) << i)) {
+
r_.i64[i] = a_.i64[src_idx++]; + } else { + r_.i64[i] = INT64_C(0); + } + } + + return simde__m128i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_expand_epi64 + #define _mm_maskz_expand_epi64(k, a) simde_mm_maskz_expand_epi64((k), (a)) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_expand_epi64(simde__m256i src, simde__mmask8 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_expand_epi64(src, k, a); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + src_ = simde__m256i_to_private(src); + simde__m256i_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i64[i] = a_.i64[src_idx++]; + } else { + r_.i64[i] = src_.i64[i]; + } + } + + return simde__m256i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_expand_epi64 + #define _mm256_mask_expand_epi64(src, k, a) simde_mm256_mask_expand_epi64((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_expand_epi64(simde__mmask8 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_expand_epi64(k, a); + #else + simde__m256i_private + a_ = simde__m256i_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i64[i] = a_.i64[src_idx++]; + } else { + r_.i64[i] = INT64_C(0); + } + } + + return simde__m256i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_expand_epi64 + #define _mm256_maskz_expand_epi64(k, a) 
simde_mm256_maskz_expand_epi64((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_expand_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_expand_epi64(src, k, a); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + src_ = simde__m512i_to_private(src); + simde__m512i_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i64[i] = a_.i64[src_idx++]; + } else { + r_.i64[i] = src_.i64[i]; + } + } + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_expand_epi64 + #define _mm512_mask_expand_epi64(src, k, a) simde_mm512_mask_expand_epi64((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_expand_epi64(simde__mmask8 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_maskz_expand_epi64(k, a); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.i64[i] = a_.i64[src_idx++]; + } else { + r_.i64[i] = INT64_C(0); + } + } + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_expand_epi64 + #define _mm512_maskz_expand_epi64(k, a) simde_mm512_maskz_expand_epi64((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_mask_expand_ps(simde__m128 src, simde__mmask8 k, simde__m128 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_expand_ps(src, k, a); + #else + simde__m128_private + a_ = simde__m128_to_private(a), + src_ = simde__m128_to_private(src); + simde__m128_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; 
i++) { + if (k & (UINT64_C(1) << i)) { + r_.f32[i] = a_.f32[src_idx++]; + } else { + r_.f32[i] = src_.f32[i]; + } + } + + return simde__m128_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_expand_ps + #define _mm_mask_expand_ps(src, k, a) simde_mm_mask_expand_ps((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_maskz_expand_ps(simde__mmask8 k, simde__m128 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_expand_ps(k, a); + #else + simde__m128_private + a_ = simde__m128_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.f32[i] = a_.f32[src_idx++]; + } else { + r_.f32[i] = SIMDE_FLOAT32_C(0.0); + } + } + + return simde__m128_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_expand_ps + #define _mm_maskz_expand_ps(k, a) simde_mm_maskz_expand_ps((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm256_mask_expand_ps(simde__m256 src, simde__mmask8 k, simde__m256 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_expand_ps(src, k, a); + #else + simde__m256_private + a_ = simde__m256_to_private(a), + src_ = simde__m256_to_private(src); + simde__m256_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.f32[i] = a_.f32[src_idx++]; + } else { + r_.f32[i] = src_.f32[i]; + } + } + + return simde__m256_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_expand_ps + #define _mm256_mask_expand_ps(src, k, a) 
simde_mm256_mask_expand_ps((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm256_maskz_expand_ps(simde__mmask8 k, simde__m256 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_expand_ps(k, a); + #else + simde__m256_private + a_ = simde__m256_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.f32[i] = a_.f32[src_idx++]; + } else { + r_.f32[i] = SIMDE_FLOAT32_C(0.0); + } + } + + return simde__m256_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_expand_ps + #define _mm256_maskz_expand_ps(k, a) simde_mm256_maskz_expand_ps((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_mask_expand_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_expand_ps(src, k, a); + #else + simde__m512_private + a_ = simde__m512_to_private(a), + src_ = simde__m512_to_private(src); + simde__m512_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.f32[i] = a_.f32[src_idx++]; + } else { + r_.f32[i] = src_.f32[i]; + } + } + + return simde__m512_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_expand_ps + #define _mm512_mask_expand_ps(src, k, a) simde_mm512_mask_expand_ps((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_maskz_expand_ps(simde__mmask16 k, simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_maskz_expand_ps(k, a); + #else + simde__m512_private + a_ = simde__m512_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + 
r_.f32[i] = a_.f32[src_idx++]; + } else { + r_.f32[i] = SIMDE_FLOAT32_C(0.0); + } + } + + return simde__m512_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_expand_ps + #define _mm512_maskz_expand_ps(k, a) simde_mm512_maskz_expand_ps((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_mask_expand_pd(simde__m128d src, simde__mmask8 k, simde__m128d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_expand_pd(src, k, a); + #else + simde__m128d_private + a_ = simde__m128d_to_private(a), + src_ = simde__m128d_to_private(src); + simde__m128d_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.f64[i] = a_.f64[src_idx++]; + } else { + r_.f64[i] = src_.f64[i]; + } + } + + return simde__m128d_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_expand_pd + #define _mm_mask_expand_pd(src, k, a) simde_mm_mask_expand_pd((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_maskz_expand_pd(simde__mmask8 k, simde__m128d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_expand_pd(k, a); + #else + simde__m128d_private + a_ = simde__m128d_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.f64[i] = a_.f64[src_idx++]; + } else { + r_.f64[i] = SIMDE_FLOAT64_C(0.0); + } + } + + return simde__m128d_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_expand_pd + #define _mm_maskz_expand_pd(k, a) simde_mm_maskz_expand_pd((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256d 
+simde_mm256_mask_expand_pd(simde__m256d src, simde__mmask8 k, simde__m256d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_expand_pd(src, k, a); + #else + simde__m256d_private + a_ = simde__m256d_to_private(a), + src_ = simde__m256d_to_private(src); + simde__m256d_private r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.f64[i] = a_.f64[src_idx++]; + } else { + r_.f64[i] = src_.f64[i]; + } + } + + return simde__m256d_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_expand_pd + #define _mm256_mask_expand_pd(src, k, a) simde_mm256_mask_expand_pd((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256d +simde_mm256_maskz_expand_pd(simde__mmask8 k, simde__m256d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_expand_pd(k, a); + #else + simde__m256d_private + a_ = simde__m256d_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.f64[i] = a_.f64[src_idx++]; + } else { + r_.f64[i] = SIMDE_FLOAT64_C(0.0); + } + } + + return simde__m256d_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_expand_pd + #define _mm256_maskz_expand_pd(k, a) simde_mm256_maskz_expand_pd((k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_mask_expand_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_expand_pd(src, k, a); + #else + simde__m512d_private + a_ = simde__m512d_to_private(a), + src_ = simde__m512d_to_private(src); + simde__m512d_private r_; + + size_t src_idx = 0; + for (size_t i = 0; 
i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.f64[i] = a_.f64[src_idx++]; + } else { + r_.f64[i] = src_.f64[i]; + } + } + + return simde__m512d_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_expand_pd + #define _mm512_mask_expand_pd(src, k, a) simde_mm512_mask_expand_pd((src), (k), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_maskz_expand_pd(simde__mmask8 k, simde__m512d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_maskz_expand_pd(k, a); + #else + simde__m512d_private + a_ = simde__m512d_to_private(a), + r_; + + size_t src_idx = 0; + for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + if (k & (UINT64_C(1) << i)) { + r_.f64[i] = a_.f64[src_idx++]; + } else { + r_.f64[i] = SIMDE_FLOAT64_C(0.0); + } + } + + return simde__m512d_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_expand_pd + #define _mm512_maskz_expand_pd(k, a) simde_mm512_maskz_expand_pd((k), (a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/x86/avx512/fixupimm.h b/thirdparty/simde/x86/avx512/fixupimm.h index 2ea234bd9..015cdea7f 100644 --- a/thirdparty/simde/x86/avx512/fixupimm.h +++ b/thirdparty/simde/x86/avx512/fixupimm.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the 
Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2026 Michael R. Crusoe + */ + #if !defined(SIMDE_X86_AVX512_FIXUPIMM_H) #define SIMDE_X86_AVX512_FIXUPIMM_H @@ -374,7 +401,14 @@ simde_mm_fixupimm_ss (simde__m128 a, simde__m128 b, simde__m128i c, int imm8) switch (((c_.i32[0] >> (select << 2)) & 15)) { case 0: - b_.f32[0] = a_.f32[0]; + #if defined(SIMDE_BUG_GCC_121064) + { + simde_float32 tmp = a_.f32[0]; + simde_memcpy(&b_.f32[0], &tmp, sizeof(tmp)); + } + #else + b_.f32[0] = a_.f32[0]; + #endif break; case 2: b_.f32[0] = SIMDE_MATH_NANF; @@ -430,7 +464,7 @@ simde_mm_fixupimm_ss (simde__m128 a, simde__m128 b, simde__m128i c, int imm8) #define _mm_fixupimm_ss(a, b, c, imm8) simde_mm_fixupimm_ss(a, b, c, imm8) #endif -#if defined(SIMDE_X86_AVX512F_NATIVE) +#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_CLANG_179057) #define simde_mm_mask_fixupimm_ss(a, k, b, c, imm8) _mm_mask_fixupimm_ss(a, k, b, c, imm8) #else #define simde_mm_mask_fixupimm_ss(a, k, b, c, imm8) simde_mm_mask_mov_ps(a, ((k) | 14), simde_mm_fixupimm_ss(a, b, c, imm8)) @@ -440,7 +474,7 @@ simde_mm_fixupimm_ss (simde__m128 a, simde__m128 b, simde__m128i c, int imm8) #define _mm_mask_fixupimm_ss(a, k, b, c, imm8) simde_mm_mask_fixupimm_ss(a, k, b, c, imm8) #endif -#if defined(SIMDE_X86_AVX512F_NATIVE) +#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_CLANG_179057) #define simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8) _mm_maskz_fixupimm_ss(k, a, b, c, imm8) #else #define 
simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8) simde_mm_maskz_mov_ps(((k) | 14), simde_mm_fixupimm_ss(a, b, c, imm8)) @@ -874,7 +908,7 @@ simde_mm_fixupimm_sd (simde__m128d a, simde__m128d b, simde__m128i c, int imm8) #define _mm_fixupimm_sd(a, b, c, imm8) simde_mm_fixupimm_sd(a, b, c, imm8) #endif -#if defined(SIMDE_X86_AVX512F_NATIVE) +#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_CLANG_179057) #define simde_mm_mask_fixupimm_sd(a, k, b, c, imm8) _mm_mask_fixupimm_sd(a, k, b, c, imm8) #else #define simde_mm_mask_fixupimm_sd(a, k, b, c, imm8) simde_mm_mask_mov_pd(a, ((k) | 2), simde_mm_fixupimm_sd(a, b, c, imm8)) @@ -884,7 +918,7 @@ simde_mm_fixupimm_sd (simde__m128d a, simde__m128d b, simde__m128i c, int imm8) #define _mm_mask_fixupimm_sd(a, k, b, c, imm8) simde_mm_mask_fixupimm_sd(a, k, b, c, imm8) #endif -#if defined(SIMDE_X86_AVX512F_NATIVE) +#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_CLANG_179057) #define simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8) _mm_maskz_fixupimm_sd(k, a, b, c, imm8) #else #define simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8) simde_mm_maskz_mov_pd(((k) | 2), simde_mm_fixupimm_sd(a, b, c, imm8)) diff --git a/thirdparty/simde/x86/avx512/fixupimm_round.h b/thirdparty/simde/x86/avx512/fixupimm_round.h index 636b82a84..8f35a6261 100644 --- a/thirdparty/simde/x86/avx512/fixupimm_round.h +++ b/thirdparty/simde/x86/avx512/fixupimm_round.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * 
included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2026 Michael R. Crusoe + */ + #if !defined(SIMDE_X86_AVX512_FIXUPIMM_ROUND_H) #define SIMDE_X86_AVX512_FIXUPIMM_ROUND_H @@ -401,7 +428,7 @@ SIMDE_BEGIN_DECLS_ #define _mm_fixupimm_round_ss(a, b, c, imm8, sae) simde_mm_fixupimm_round_ss(a, b, c, imm8, sae) #endif -#if defined(SIMDE_X86_AVX512F_NATIVE) +#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_CLANG_179057) #define simde_mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) _mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) #elif defined(SIMDE_FAST_EXCEPTIONS) #define simde_mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) simde_mm_mask_fixupimm_ss(a, k, b, c, imm8) @@ -457,7 +484,7 @@ SIMDE_BEGIN_DECLS_ #define _mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) simde_mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) #endif -#if defined(SIMDE_X86_AVX512F_NATIVE) +#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_CLANG_179057) #define simde_mm_maskz_fixupimm_round_ss(k, a, b, c, imm8, sae) _mm_maskz_fixupimm_round_ss(k, a, b, c, imm8, sae) #elif defined(SIMDE_FAST_EXCEPTIONS) #define simde_mm_maskz_fixupimm_round_ss(k, a, b, c, imm8, sae) simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8) @@ -569,7 +596,7 @@ SIMDE_BEGIN_DECLS_ #define _mm_fixupimm_round_sd(a, b, c, imm8, sae) simde_mm_fixupimm_round_sd(a, b, c, imm8, sae) #endif -#if defined(SIMDE_X86_AVX512F_NATIVE) +#if 
defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_CLANG_179057) #define simde_mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) _mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) #elif defined(SIMDE_FAST_EXCEPTIONS) #define simde_mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) simde_mm_mask_fixupimm_sd(a, k, b, c, imm8) @@ -625,7 +652,7 @@ SIMDE_BEGIN_DECLS_ #define _mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) simde_mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) #endif -#if defined(SIMDE_X86_AVX512F_NATIVE) +#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_CLANG_179057) #define simde_mm_maskz_fixupimm_round_sd(k, a, b, c, imm8, sae) _mm_maskz_fixupimm_round_sd(k, a, b, c, imm8, sae) #elif defined(SIMDE_FAST_EXCEPTIONS) #define simde_mm_maskz_fixupimm_round_sd(k, a, b, c, imm8, sae) simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8) diff --git a/thirdparty/simde/x86/avx512/flushsubnormal.h b/thirdparty/simde/x86/avx512/flushsubnormal.h index 6830e7c69..cb94f9662 100644 --- a/thirdparty/simde/x86/avx512/flushsubnormal.h +++ b/thirdparty/simde/x86/avx512/flushsubnormal.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_FLUSHSUBNORMAL_H) #define SIMDE_X86_AVX512_FLUSHSUBNORMAL_H diff --git a/thirdparty/simde/x86/avx512/fmaddsub.h b/thirdparty/simde/x86/avx512/fmaddsub.h index f1139e4d6..478f98909 100644 --- a/thirdparty/simde/x86/avx512/fmaddsub.h +++ b/thirdparty/simde/x86/avx512/fmaddsub.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2024 Robin Christ + */ + #if !defined(SIMDE_X86_AVX512_FMADDSUB_H) #define SIMDE_X86_AVX512_FMADDSUB_H diff --git a/thirdparty/simde/x86/avx512/gather.h b/thirdparty/simde/x86/avx512/gather.h index 8dec2ee0a..5e8d63b80 100644 --- a/thirdparty/simde/x86/avx512/gather.h +++ b/thirdparty/simde/x86/avx512/gather.h @@ -30,6 +30,7 @@ #include "types.h" #include "../avx2.h" #include "extract.h" +#include "setzero.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS diff --git a/thirdparty/simde/x86/avx512/insert.h b/thirdparty/simde/x86/avx512/insert.h index 67120d31c..ffd1c329f 100644 --- a/thirdparty/simde/x86/avx512/insert.h +++ b/thirdparty/simde/x86/avx512/insert.h @@ -30,11 +30,99 @@ #include "types.h" #include "mov.h" +#include "setzero.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm256_insertf32x4 (simde__m256 a, simde__m128 b, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + simde__m256 r; + switch(imm8) { + case 0: r = _mm256_insertf32x4(a, b, 0); break; + case 1: r = _mm256_insertf32x4(a, b, 1); break; + default: HEDLEY_UNREACHABLE(); r = simde_mm256_setzero_ps(); break; + } + return r; + #else + simde__m256_private a_ = simde__m256_to_private(a); + + a_.m128[imm8 & 1] = b; + + return simde__m256_from_private(a_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_insertf32x4 + #define _mm256_insertf32x4(a, b, imm8) simde_mm256_insertf32x4(a, b, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256d +simde_mm256_insertf64x2 (simde__m256d a, simde__m128d b, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + simde__m256d r; + switch(imm8) { + case 0: r = _mm256_insertf64x2(a, b, 0); break; + case 
1: r = _mm256_insertf64x2(a, b, 1); break; + default: HEDLEY_UNREACHABLE(); r = simde_mm256_setzero_pd(); break; + } + return r; + #else + simde__m256d_private a_ = simde__m256d_to_private(a); + + a_.m128d[imm8 & 1] = b; + + return simde__m256d_from_private(a_); + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_insertf64x2 + #define _mm256_insertf64x2(a, b, imm8) simde_mm256_insertf64x2(a, b, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_inserti32x4 (simde__m256i a, simde__m128i b, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { + simde__m256i_private a_ = simde__m256i_to_private(a); + + a_.m128i[imm8 & 1] = b; + + return simde__m256i_from_private(a_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_inserti32x4(a, b, imm8) _mm256_inserti32x4(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_inserti32x4 + #define _mm256_inserti32x4(a, b, imm8) simde_mm256_inserti32x4(a, b, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_inserti64x2 (simde__m256i a, simde__m128i b, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { + simde__m256i_private a_ = simde__m256i_to_private(a); + + a_.m128i[imm8 & 1] = b; + + return simde__m256i_from_private(a_); +} +#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_inserti64x2(a, b, imm8) _mm256_inserti64x2(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_inserti64x2 + #define _mm256_inserti64x2(a, b, imm8) simde_mm256_inserti64x2(a, b, imm8) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_insertf32x4 (simde__m512 a, simde__m128 b, int imm8) diff --git a/thirdparty/simde/x86/avx512/knot.h 
b/thirdparty/simde/x86/avx512/knot.h index 3b4696e8b..511a1c693 100644 --- a/thirdparty/simde/x86/avx512/knot.h +++ b/thirdparty/simde/x86/avx512/knot.h @@ -41,7 +41,7 @@ simde_knot_mask8 (simde__mmask8 a) { && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) return _knot_mask8(a); #else - return ~a; + return HEDLEY_STATIC_CAST(simde__mmask8, ~a); #endif } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) @@ -57,7 +57,7 @@ simde_knot_mask16 (simde__mmask16 a) { && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) return _knot_mask16(a); #else - return ~a; + return HEDLEY_STATIC_CAST(simde__mmask16, ~a); #endif } #define simde_mm512_knot(a) simde_knot_mask16(a) diff --git a/thirdparty/simde/x86/avx512/load.h b/thirdparty/simde/x86/avx512/load.h index 6a4af937d..2a209d3f9 100644 --- a/thirdparty/simde/x86/avx512/load.h +++ b/thirdparty/simde/x86/avx512/load.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2025 Michael R. Crusoe */ #if !defined(SIMDE_X86_AVX512_LOAD_H) @@ -33,6 +34,38 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__m128h +simde_mm_load_ph (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_load_ph(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128h)); + #else + simde__m128h r; + simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128h), sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_load_ph + #define _mm_load_ph(a) simde_mm_load_ph(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256h +simde_mm256_load_ph (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_load_ph(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256h)); + #else + simde__m256h r; + simde_memcpy(&r, 
SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256h), sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_load_ph + #define _mm256_load_ph(a) simde_mm256_load_ph(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_load_pd (void const * mem_addr) { diff --git a/thirdparty/simde/x86/avx512/loadu.h b/thirdparty/simde/x86/avx512/loadu.h index 4a31966b4..0a9f7b9b1 100644 --- a/thirdparty/simde/x86/avx512/loadu.h +++ b/thirdparty/simde/x86/avx512/loadu.h @@ -22,17 +22,54 @@ * * Copyright: * 2020 Evan Nemerson + * 2025 Michael R. Crusoe */ #if !defined(SIMDE_X86_AVX512_LOADU_H) #define SIMDE_X86_AVX512_LOADU_H #include "types.h" +#include "mov.h" +#include "../avx.h" +#include "../sse2.h" +#include "../sse.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__m128h +simde_mm_loadu_ph (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_loadu_ph(mem_addr); + #else + simde__m128h r; + simde_memcpy(&r, mem_addr, sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_loadu_ph + #define _mm_loadu_ph(a) simde_mm_loadu_ph(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256h +simde_mm256_loadu_ph (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_loadu_ph(mem_addr); + #else + simde__m256h r; + simde_memcpy(&r, mem_addr, sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_loadu_ph + #define _mm256_loadu_ph(a) simde_mm256_loadu_ph(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_loadu_ps (void const * mem_addr) { @@ -136,10 +173,196 @@
simde_mm512_loadu_si512 (void const * mem_addr) { #define _mm512_loadu_epi32(a) simde_mm512_loadu_epi32(a) #define _mm512_loadu_epi64(a) simde_mm512_loadu_epi64(a) #endif +#define simde_x_mm512_loadu_epu8(mem_addr) simde_mm512_loadu_si512(mem_addr) +#define simde_x_mm512_loadu_epu16(mem_addr) simde_mm512_loadu_si512(mem_addr) +#define simde_x_mm512_loadu_epu32(mem_addr) simde_mm512_loadu_si512(mem_addr) +#define simde_x_mm512_loadu_epu64(mem_addr) simde_mm512_loadu_si512(mem_addr) + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_loadu_epi8(simde__m128i src, simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_loadu_epi8(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_mask_mov_epi8(src, k, simde_mm_loadu_epi8(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_loadu_epi8 + #define _mm_mask_loadu_epi8(src, k, mem_addr) simde_mm_mask_loadu_epi8(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_loadu_epi8(simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_loadu_epi8(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_maskz_mov_epi8(k, simde_mm_loadu_epi8(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_loadu_epi8 + #define _mm_maskz_loadu_epi8(k, mem_addr) simde_mm_maskz_loadu_epi8(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_loadu_epi16(simde__m128i src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_loadu_epi16(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + 
#else + return simde_mm_mask_mov_epi16(src, k, simde_mm_loadu_epi16(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_loadu_epi16 + #define _mm_mask_loadu_epi16(src, k, mem_addr) simde_mm_mask_loadu_epi16(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_loadu_epi16(simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_loadu_epi16(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_maskz_mov_epi16(k, simde_mm_loadu_epi16(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_loadu_epi16 + #define _mm_maskz_loadu_epi16(k, mem_addr) simde_mm_maskz_loadu_epi16(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_loadu_epi32(simde__m128i src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_loadu_epi32(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_mask_mov_epi32(src, k, simde_mm_loadu_epi32(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_loadu_epi32 + #define _mm_mask_loadu_epi32(src, k, mem_addr) simde_mm_mask_loadu_epi32(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_loadu_epi32(simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_loadu_epi32(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_maskz_mov_epi32(k, simde_mm_loadu_epi32(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_loadu_epi32 + #define _mm_maskz_loadu_epi32(k, mem_addr) simde_mm_maskz_loadu_epi32(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_loadu_epi64(simde__m128i src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_loadu_epi64(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_mask_mov_epi64(src, k, simde_mm_loadu_epi64(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_loadu_epi64 + #define _mm_mask_loadu_epi64(src, k, mem_addr) simde_mm_mask_loadu_epi64(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_loadu_epi64(simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_loadu_epi64(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_maskz_mov_epi64(k, simde_mm_loadu_epi64(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_loadu_epi64 + #define _mm_maskz_loadu_epi64(k, mem_addr) simde_mm_maskz_loadu_epi64(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_mask_loadu_ps(simde__m128 src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_loadu_ps(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_mask_mov_ps(src, k, simde_mm_loadu_ps(HEDLEY_REINTERPRET_CAST(const float*, mem_addr))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_loadu_ps + #define _mm_mask_loadu_ps(src, k, mem_addr) 
simde_mm_mask_loadu_ps(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm_maskz_loadu_ps(simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_loadu_ps(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_maskz_mov_ps(k, simde_mm_loadu_ps(HEDLEY_REINTERPRET_CAST(const float*, mem_addr))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_loadu_ps + #define _mm_maskz_loadu_ps(k, mem_addr) simde_mm_maskz_loadu_ps(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_mask_loadu_pd(simde__m128d src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_loadu_pd(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_mask_mov_pd(src, k, simde_mm_loadu_pd(HEDLEY_REINTERPRET_CAST(const double*, mem_addr))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_loadu_pd + #define _mm_mask_loadu_pd(src, k, mem_addr) simde_mm_mask_loadu_pd(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_mm_maskz_loadu_pd(simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_loadu_pd(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm_maskz_mov_pd(k, simde_mm_loadu_pd(HEDLEY_REINTERPRET_CAST(const double*, mem_addr))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_loadu_pd + #define _mm_maskz_loadu_pd(k, mem_addr) simde_mm_maskz_loadu_pd(k, mem_addr) +#endif SIMDE_FUNCTION_ATTRIBUTES simde__m256i -simde_mm256_maskz_loadu_epi16 
(simde__mmask16 k, void const * mem_addr) { +simde_mm256_mask_loadu_epi16(simde__m256i src, simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_loadu_epi16(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_mask_mov_epi16(src, k, simde_mm256_loadu_epi16(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_loadu_epi16 + #define _mm256_mask_loadu_epi16(src, k, mem_addr) simde_mm256_mask_loadu_epi16(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_loadu_epi16(simde__mmask16 k, void const * mem_addr) { #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_maskz_loadu_epi16(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); #else @@ -151,6 +374,104 @@ simde_mm256_maskz_loadu_epi16 (simde__mmask16 k, void const * mem_addr) { #define _mm256_maskz_loadu_epi16(k, mem_addr) simde_mm256_maskz_loadu_epi16(k, mem_addr) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_loadu_epi32(simde__m256i src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_loadu_epi32(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_mask_mov_epi32(src, k, simde_mm256_loadu_epi32(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_loadu_epi32 + #define _mm256_mask_loadu_epi32(src, k, mem_addr) simde_mm256_mask_loadu_epi32(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_loadu_epi32(simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return 
_mm256_maskz_loadu_epi32(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_maskz_mov_epi32(k, simde_mm256_loadu_epi32(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_loadu_epi32 + #define _mm256_maskz_loadu_epi32(k, mem_addr) simde_mm256_maskz_loadu_epi32(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_loadu_epi64(simde__m256i src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_loadu_epi64(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_mask_mov_epi64(src, k, simde_mm256_loadu_epi64(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_loadu_epi64 + #define _mm256_mask_loadu_epi64(src, k, mem_addr) simde_mm256_mask_loadu_epi64(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_loadu_epi64(simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_loadu_epi64(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_maskz_mov_epi64(k, simde_mm256_loadu_epi64(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_loadu_epi64 + #define _mm256_maskz_loadu_epi64(k, mem_addr) simde_mm256_maskz_loadu_epi64(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256d +simde_mm256_mask_loadu_pd (simde__m256d src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_loadu_pd(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return 
simde_mm256_mask_mov_pd(src, k, simde_mm256_loadu_pd(HEDLEY_REINTERPRET_CAST(const double*, mem_addr))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_loadu_pd + #define _mm256_mask_loadu_pd(src, k, mem_addr) simde_mm256_mask_loadu_pd(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256d +simde_mm256_maskz_loadu_pd (simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_loadu_pd(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_maskz_mov_pd(k, simde_mm256_loadu_pd(HEDLEY_REINTERPRET_CAST(const double*, mem_addr))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_loadu_pd + #define _mm256_maskz_loadu_pd(k, mem_addr) simde_mm256_maskz_loadu_pd(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm256_mask_loadu_ps (simde__m256 src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_mask_loadu_ps(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_mask_mov_ps(src, k, simde_mm256_loadu_ps(HEDLEY_REINTERPRET_CAST(const float*, mem_addr))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_loadu_ps + #define _mm256_mask_loadu_ps(src, k, mem_addr) simde_mm256_mask_loadu_ps(src, k, mem_addr) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256 simde_mm256_maskz_loadu_ps (simde__mmask8 k, void const * mem_addr) { @@ -165,6 +486,34 @@ simde_mm256_maskz_loadu_ps (simde__mmask8 k, void const * mem_addr) { #define _mm256_maskz_loadu_ps(k, mem_addr) simde_mm256_maskz_loadu_ps(k, mem_addr) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i 
+simde_mm512_mask_loadu_epi8 (simde__m512i src, simde__mmask64 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_loadu_epi8(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_epi8(src, k, simde_mm512_loadu_epi8(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_epi8 + #define _mm512_mask_loadu_epi8(src, k, mem_addr) simde_mm512_mask_loadu_epi8(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_loadu_epi8 (simde__mmask64 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_maskz_loadu_epi8(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_maskz_mov_epi8(k, simde_mm512_loadu_epi8(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_loadu_epi8 + #define _mm512_maskz_loadu_epi8(k, mem_addr) simde_mm512_maskz_loadu_epi8(k, mem_addr) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_mask_loadu_epi16 (simde__m512i src, simde__mmask32 k, void const * mem_addr) { @@ -207,6 +556,21 @@ simde_mm512_mask_loadu_epi32 (simde__m512i src, simde__mmask16 k, void const * m #define _mm512_mask_loadu_epi32(src, k, mem_addr) simde_mm512_mask_loadu_epi32(src, k, mem_addr) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_loadu_epi32 (simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_maskz_loadu_epi32(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_maskz_mov_epi32(k, simde_mm512_loadu_epi32(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_loadu_epi32 + #define _mm512_maskz_loadu_epi32(k, mem_addr) simde_mm512_maskz_loadu_epi32(k, mem_addr) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_mask_loadu_epi64 (simde__m512i src, 
simde__mmask8 k, void const * mem_addr) { diff --git a/thirdparty/simde/x86/avx512/lzcnt.h b/thirdparty/simde/x86/avx512/lzcnt.h index 41a0eecbd..27af9b662 100644 --- a/thirdparty/simde/x86/avx512/lzcnt.h +++ b/thirdparty/simde/x86/avx512/lzcnt.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2025 Michael R. Crusoe */ #if !defined(SIMDE_X86_AVX512_LZCNT_H) @@ -195,7 +196,7 @@ simde_mm_mask_lzcnt_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a) { return simde_mm_mask_mov_epi32(src, k, simde_mm_lzcnt_epi32(a)); #endif } -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) #undef _mm_mask_lzcnt_epi32 #define _mm_mask_lzcnt_epi32(src, k, a) simde_mm_mask_lzcnt_epi32(src, k, a) #endif @@ -209,11 +210,266 @@ simde_mm_maskz_lzcnt_epi32(simde__mmask8 k, simde__m128i a) { return simde_mm_maskz_mov_epi32(k, simde_mm_lzcnt_epi32(a)); #endif } -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_lzcnt_epi32 #define _mm_maskz_lzcnt_epi32(k, a) simde_mm_maskz_lzcnt_epi32(k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_lzcnt_epi32(simde__m256i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm256_lzcnt_epi32(a); + #else + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])); i++) { + r_.m128i[i] = simde_mm_lzcnt_epi32(a_.m128i[i]); + } + + return simde__m256i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm256_lzcnt_epi32 + #define _mm256_lzcnt_epi32(a) simde_mm256_lzcnt_epi32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_lzcnt_epi32(simde__m256i 
src, simde__mmask8 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm256_mask_lzcnt_epi32(src, k, a); + #else + return simde_mm256_mask_mov_epi32(src, k, simde_mm256_lzcnt_epi32(a)); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_lzcnt_epi32 + #define _mm256_mask_lzcnt_epi32(src, k, a) simde_mm256_mask_lzcnt_epi32(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_lzcnt_epi32(simde__mmask8 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm256_maskz_lzcnt_epi32(k, a); + #else + return simde_mm256_maskz_mov_epi32(k, simde_mm256_lzcnt_epi32(a)); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_lzcnt_epi32 + #define _mm256_maskz_lzcnt_epi32(k, a) simde_mm256_maskz_lzcnt_epi32(k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_lzcnt_epi32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm512_lzcnt_epi32(a); + #else + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])); i++) { + r_.m128i[i] = simde_mm_lzcnt_epi32(a_.m128i[i]); + } + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm512_lzcnt_epi32 + #define _mm512_lzcnt_epi32(a) simde_mm512_lzcnt_epi32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_lzcnt_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm512_mask_lzcnt_epi32(src, k, a); + #else + return simde_mm512_mask_mov_epi32(src, k, simde_mm512_lzcnt_epi32(a)); + #endif +} +#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef 
_mm512_mask_lzcnt_epi32 + #define _mm512_mask_lzcnt_epi32(src, k, a) simde_mm512_mask_lzcnt_epi32(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_lzcnt_epi32(simde__mmask16 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm512_maskz_lzcnt_epi32(k, a); + #else + return simde_mm512_maskz_mov_epi32(k, simde_mm512_lzcnt_epi32(a)); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_lzcnt_epi32 + #define _mm512_maskz_lzcnt_epi32(k, a) simde_mm512_maskz_lzcnt_epi32(k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_lzcnt_epi64(simde__m128i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm_lzcnt_epi64(a); + #else + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { + r_.i64[i] = (HEDLEY_UNLIKELY(a_.i64[i] == 0) ? 
HEDLEY_STATIC_CAST(int64_t, sizeof(int64_t) * CHAR_BIT) : HEDLEY_STATIC_CAST(int64_t, simde_x_clz64(HEDLEY_STATIC_CAST(uint64_t, a_.i64[i])))); + } + + return simde__m128i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm_lzcnt_epi64 + #define _mm_lzcnt_epi64(a) simde_mm_lzcnt_epi64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_lzcnt_epi64(simde__m128i src, simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm_mask_lzcnt_epi64(src, k, a); + #else + return simde_mm_mask_mov_epi64(src, k, simde_mm_lzcnt_epi64(a)); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_lzcnt_epi64 + #define _mm_mask_lzcnt_epi64(src, k, a) simde_mm_mask_lzcnt_epi64(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_lzcnt_epi64(simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm_maskz_lzcnt_epi64(k, a); + #else + return simde_mm_maskz_mov_epi64(k, simde_mm_lzcnt_epi64(a)); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_lzcnt_epi64 + #define _mm_maskz_lzcnt_epi64(k, a) simde_mm_maskz_lzcnt_epi64(k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_lzcnt_epi64(simde__m256i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm256_lzcnt_epi64(a); + #else + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])); i++) { + r_.m128i[i] = simde_mm_lzcnt_epi64(a_.m128i[i]); + } + + return simde__m256i_from_private(r_); + #endif +} +#if 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm256_lzcnt_epi64 + #define _mm256_lzcnt_epi64(a) simde_mm256_lzcnt_epi64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_mask_lzcnt_epi64(simde__m256i src, simde__mmask8 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm256_mask_lzcnt_epi64(src, k, a); + #else + return simde_mm256_mask_mov_epi64(src, k, simde_mm256_lzcnt_epi64(a)); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_lzcnt_epi64 + #define _mm256_mask_lzcnt_epi64(src, k, a) simde_mm256_mask_lzcnt_epi64(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_lzcnt_epi64(simde__mmask8 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm256_maskz_lzcnt_epi64(k, a); + #else + return simde_mm256_maskz_mov_epi64(k, simde_mm256_lzcnt_epi64(a)); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_lzcnt_epi64 + #define _mm256_maskz_lzcnt_epi64(k, a) simde_mm256_maskz_lzcnt_epi64(k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_lzcnt_epi64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm512_lzcnt_epi64(a); + #else + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])); i++) { + r_.m128i[i] = simde_mm_lzcnt_epi64(a_.m128i[i]); + } + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm512_lzcnt_epi64 + #define _mm512_lzcnt_epi64(a) simde_mm512_lzcnt_epi64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_lzcnt_epi64(simde__m512i src, 
simde__mmask8 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm512_mask_lzcnt_epi64(src, k, a); + #else + return simde_mm512_mask_mov_epi64(src, k, simde_mm512_lzcnt_epi64(a)); + #endif +} +#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_lzcnt_epi64 + #define _mm512_mask_lzcnt_epi64(src, k, a) simde_mm512_mask_lzcnt_epi64(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_lzcnt_epi64(simde__mmask8 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) + return _mm512_maskz_lzcnt_epi64(k, a); + #else + return simde_mm512_maskz_mov_epi64(k, simde_mm512_lzcnt_epi64(a)); + #endif +} +#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_lzcnt_epi64 + #define _mm512_maskz_lzcnt_epi64(k, a) simde_mm512_maskz_lzcnt_epi64(k, a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/x86/avx512/mov.h b/thirdparty/simde/x86/avx512/mov.h index cee9dbb37..d760fec21 100644 --- a/thirdparty/simde/x86/avx512/mov.h +++ b/thirdparty/simde/x86/avx512/mov.h @@ -60,6 +60,22 @@ simde_mm_mask_mov_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i a) { #define _mm_mask_mov_epi8(src, k, a) simde_mm_mask_mov_epi8(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_mask_mov_epu8 (simde__m128i src, simde__mmask16 k, simde__m128i a) { + simde__m128i_private + src_ = simde__m128i_to_private(src), + a_ = simde__m128i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { + r_.u8[i] = ((k >> i) & 1) ? 
a_.u8[i] : src_.u8[i]; + } + + return simde__m128i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_mask_mov_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i a) { @@ -84,6 +100,22 @@ simde_mm_mask_mov_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i a) { #define _mm_mask_mov_epi16(src, k, a) simde_mm_mask_mov_epi16(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_mask_mov_epu16 (simde__m128i src, simde__mmask8 k, simde__m128i a) { + simde__m128i_private + src_ = simde__m128i_to_private(src), + a_ = simde__m128i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = ((k >> i) & 1) ? a_.u16[i] : src_.u16[i]; + } + + return simde__m128i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_mask_mov_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a) { @@ -108,6 +140,22 @@ simde_mm_mask_mov_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a) { #define _mm_mask_mov_epi32(src, k, a) simde_mm_mask_mov_epi32(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_mask_mov_epu32 (simde__m128i src, simde__mmask8 k, simde__m128i a) { + simde__m128i_private + src_ = simde__m128i_to_private(src), + a_ = simde__m128i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = ((k >> i) & 1) ? 
a_.u32[i] : src_.u32[i]; + } + + return simde__m128i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_mask_mov_epi64 (simde__m128i src, simde__mmask8 k, simde__m128i a) { @@ -133,6 +181,22 @@ simde_mm_mask_mov_epi64 (simde__m128i src, simde__mmask8 k, simde__m128i a) { #define _mm_mask_mov_epi64(src, k, a) simde_mm_mask_mov_epi64(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_mask_mov_epu64 (simde__m128i src, simde__mmask8 k, simde__m128i a) { + simde__m128i_private + src_ = simde__m128i_to_private(src), + a_ = simde__m128i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = ((k >> i) & 1) ? a_.u64[i] : src_.u64[i]; + } + + return simde__m128i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_mask_mov_pd(simde__m128d src, simde__mmask8 k, simde__m128d a) { @@ -190,6 +254,22 @@ simde_mm256_mask_mov_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i a) { #define _mm256_mask_mov_epi8(src, k, a) simde_mm256_mask_mov_epi8(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_mask_mov_epu8 (simde__m256i src, simde__mmask32 k, simde__m256i a) { + simde__m256i_private + r_, + src_ = simde__m256i_to_private(src), + a_ = simde__m256i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { + r_.u8[i] = ((k >> i) & 1) ? 
a_.u8[i] : src_.u8[i]; + } + + return simde__m256i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_mask_mov_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i a) { @@ -219,6 +299,22 @@ simde_mm256_mask_mov_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i a) #define _mm256_mask_mov_epi16(src, k, a) simde_mm256_mask_mov_epi16(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_mask_mov_epu16 (simde__m256i src, simde__mmask16 k, simde__m256i a) { + simde__m256i_private + src_ = simde__m256i_to_private(src), + a_ = simde__m256i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = ((k >> i) & 1) ? a_.u16[i] : src_.u16[i]; + } + + return simde__m256i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_mask_mov_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a) { @@ -248,6 +344,22 @@ simde_mm256_mask_mov_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a) { #define _mm256_mask_mov_epi32(src, k, a) simde_mm256_mask_mov_epi32(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_mask_mov_epu32 (simde__m256i src, simde__mmask8 k, simde__m256i a) { + simde__m256i_private + src_ = simde__m256i_to_private(src), + a_ = simde__m256i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = ((k >> i) & 1) ? 
a_.u32[i] : src_.u32[i]; + } + + return simde__m256i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_mask_mov_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i a) { @@ -278,6 +390,22 @@ simde_mm256_mask_mov_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i a) { #define _mm256_mask_mov_epi64(src, k, a) simde_mm256_mask_mov_epi64(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_x_mm256_mask_mov_epu64 (simde__m256i src, simde__mmask8 k, simde__m256i a) { + simde__m256i_private + src_ = simde__m256i_to_private(src), + a_ = simde__m256i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = ((k >> i) & 1) ? a_.u64[i] : src_.u64[i]; + } + + return simde__m256i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256d simde_mm256_mask_mov_pd (simde__m256d src, simde__mmask8 k, simde__m256d a) { @@ -335,6 +463,22 @@ simde_mm512_mask_mov_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a) { #define _mm512_mask_mov_epi8(src, k, a) simde_mm512_mask_mov_epi8(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_x_mm512_mask_mov_epu8 (simde__m512i src, simde__mmask64 k, simde__m512i a) { + simde__m512i_private + src_ = simde__m512i_to_private(src), + a_ = simde__m512i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { + r_.u8[i] = ((k >> i) & 1) ? 
a_.u8[i] : src_.u8[i]; + } + + return simde__m512i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_mask_mov_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a) { @@ -393,6 +537,22 @@ simde_mm512_mask_mov_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a) #define _mm512_mask_mov_epi32(src, k, a) simde_mm512_mask_mov_epi32(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_x_mm512_mask_mov_epu32 (simde__m512i src, simde__mmask16 k, simde__m512i a) { + simde__m512i_private + src_ = simde__m512i_to_private(src), + a_ = simde__m512i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = ((k >> i) & 1) ? a_.u32[i] : src_.u32[i]; + } + + return simde__m512i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_mask_mov_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a) { @@ -423,6 +583,22 @@ simde_mm512_mask_mov_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a) { #define _mm512_mask_mov_epi64(src, k, a) simde_mm512_mask_mov_epi64(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_x_mm512_mask_mov_epu64 (simde__m512i src, simde__mmask8 k, simde__m512i a) { + simde__m512i_private + src_ = simde__m512i_to_private(src), + a_ = simde__m512i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = ((k >> i) & 1) ? 
a_.u64[i] : src_.u64[i]; + } + + return simde__m512i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_mask_mov_pd (simde__m512d src, simde__mmask8 k, simde__m512d a) { diff --git a/thirdparty/simde/x86/avx512/movm.h b/thirdparty/simde/x86/avx512/movm.h index 452e127ab..d469bfa12 100644 --- a/thirdparty/simde/x86/avx512/movm.h +++ b/thirdparty/simde/x86/avx512/movm.h @@ -31,6 +31,7 @@ #include "types.h" #include "../avx2.h" #include "set.h" +#include "setzero.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS diff --git a/thirdparty/simde/x86/avx512/multishift.h b/thirdparty/simde/x86/avx512/multishift.h index 5388d0d07..1315a0ab3 100644 --- a/thirdparty/simde/x86/avx512/multishift.h +++ b/thirdparty/simde/x86/avx512/multishift.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2023 Michael R. 
Crusoe + */ + #if !defined(SIMDE_X86_AVX512_MULTISHIFT_H) #define SIMDE_X86_AVX512_MULTISHIFT_H diff --git a/thirdparty/simde/x86/avx512/permutex.h b/thirdparty/simde/x86/avx512/permutex.h index 91c35cc21..4ac912218 100644 --- a/thirdparty/simde/x86/avx512/permutex.h +++ b/thirdparty/simde/x86/avx512/permutex.h @@ -28,6 +28,7 @@ #define SIMDE_X86_AVX512_PERMUTEX_H #include "types.h" +#include "setzero.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS diff --git a/thirdparty/simde/x86/avx512/permutexvar.h b/thirdparty/simde/x86/avx512/permutexvar.h index 1b4bf7ac6..497500948 100644 --- a/thirdparty/simde/x86/avx512/permutexvar.h +++ b/thirdparty/simde/x86/avx512/permutexvar.h @@ -530,7 +530,19 @@ simde_mm256_permutexvar_pd (simde__m256i idx, simde__m256d a) { #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) return _mm256_permutexvar_pd(idx, a); #else - return simde_mm256_castsi256_pd(simde_mm256_permutexvar_epi64(idx, simde_mm256_castpd_si256(a))); + simde__m256i_private idx_ = simde__m256i_to_private(idx); + simde__m256d_private + a_ = simde__m256d_to_private(a), + r_; + + #if !defined(__INTEL_COMPILER) + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = a_.f64[idx_.i64[i] & 3]; + } + + return simde__m256d_from_private(r_); #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) diff --git a/thirdparty/simde/x86/avx512/popcnt.h b/thirdparty/simde/x86/avx512/popcnt.h index b3c81253e..a0daf0e7b 100644 --- a/thirdparty/simde/x86/avx512/popcnt.h +++ b/thirdparty/simde/x86/avx512/popcnt.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, 
merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2021 Evan Nemerson + */ + #if !defined(SIMDE_X86_AVX512_POPCNT_H) #define SIMDE_X86_AVX512_POPCNT_H diff --git a/thirdparty/simde/x86/avx512/range.h b/thirdparty/simde/x86/avx512/range.h index 1d8c0fb49..c6277cfe1 100644 --- a/thirdparty/simde/x86/avx512/range.h +++ b/thirdparty/simde/x86/avx512/range.h @@ -1,3 +1,31 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2021 Evan Nemerson + * 2023 Michael R. Crusoe + */ + #if !defined(SIMDE_X86_AVX512_RANGE_H) #define SIMDE_X86_AVX512_RANGE_H diff --git a/thirdparty/simde/x86/avx512/range_round.h b/thirdparty/simde/x86/avx512/range_round.h index 7bf132075..c7dc8e7c6 100644 --- a/thirdparty/simde/x86/avx512/range_round.h +++ b/thirdparty/simde/x86/avx512/range_round.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2023 Michael R. Crusoe + */ + #if !defined(SIMDE_X86_AVX512_RANGE_ROUND_H) #define SIMDE_X86_AVX512_RANGE_ROUND_H diff --git a/thirdparty/simde/x86/avx512/reduce.h b/thirdparty/simde/x86/avx512/reduce.h index c007572e2..501603071 100644 --- a/thirdparty/simde/x86/avx512/reduce.h +++ b/thirdparty/simde/x86/avx512/reduce.h @@ -349,6 +349,266 @@ simde_mm512_reduce_min_ps(simde__m512 a) { # define _mm512_reduce_min_ps(a) simde_mm512_reduce_min_ps((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_mm512_reduce_add_epi32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_add_epi32(a); + #else + simde__m512i_private a_; + int32_t r; + a_ = simde__m512i_to_private(a); + + r = INT32_C(0); + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r += a_.i32[i]; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_add_epi32(a) simde_mm512_reduce_add_epi32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_mm512_reduce_add_epi64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_add_epi64(a); + #else + simde__m512i_private a_; + int64_t r; + a_ = simde__m512i_to_private(a); + + r = INT64_C(0); + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + r += a_.i64[i]; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_add_epi64(a) simde_mm512_reduce_add_epi64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32 +simde_mm512_reduce_add_ps(simde__m512 a) { + #if 
defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_add_ps(a); + #else + simde__m512_private a_; + simde_float32 r; + a_ = simde__m512_to_private(a); + + /* pairwise tree reduction matching Intel's implementation: + * _mm256_add_ps(lo256, hi256) -> _mm_add_ps(lo128, hi128) + * -> _mm_movehl_ps -> _mm_add_ss */ + simde_float32 t[8], u[4], v[2]; + for (size_t i = 0 ; i < 8 ; i++) { + t[i] = a_.f32[i] + a_.f32[i + 8]; + } + for (size_t i = 0 ; i < 4 ; i++) { + u[i] = t[i] + t[i + 4]; + } + v[0] = u[0] + u[2]; + v[1] = u[1] + u[3]; + r = v[0] + v[1]; + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_add_ps(a) simde_mm512_reduce_add_ps((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64 +simde_mm512_reduce_add_pd(simde__m512d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_add_pd(a); + #else + simde__m512d_private a_; + simde_float64 r; + a_ = simde__m512d_to_private(a); + + r = SIMDE_FLOAT64_C(0.0); + SIMDE_VECTORIZE_REDUCTION(+:r) + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + r += a_.f64[i]; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_add_pd(a) simde_mm512_reduce_add_pd((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16 +simde_mm512_reduce_add_ph(simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_reduce_add_ph(a); + #else + simde__m512h_private a_; + simde_float16 r; + a_ = simde__m512h_to_private(a); + + /* pairwise tree reduction, each step via float32 to simulate float16 add + * (upcasts to float32, adds, rounds back to float16) */ + simde_float16 t[16], u[8], v[4], w[2]; + for (size_t i = 0 ; i < 16 ; i++) { + t[i] = simde_float16_from_float32(simde_float16_to_float32(a_.f16[i]) + simde_float16_to_float32(a_.f16[i + 16])); + } + for (size_t i = 0 ; i < 8 ; i++) { + u[i] = simde_float16_from_float32(simde_float16_to_float32(t[i]) + simde_float16_to_float32(t[i + 
8])); + } + for (size_t i = 0 ; i < 4 ; i++) { + v[i] = simde_float16_from_float32(simde_float16_to_float32(u[i]) + simde_float16_to_float32(u[i + 4])); + } + w[0] = simde_float16_from_float32(simde_float16_to_float32(v[0]) + simde_float16_to_float32(v[2])); + w[1] = simde_float16_from_float32(simde_float16_to_float32(v[1]) + simde_float16_to_float32(v[3])); + r = simde_float16_from_float32(simde_float16_to_float32(w[0]) + simde_float16_to_float32(w[1])); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_add_ph(a) simde_mm512_reduce_add_ph((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_mm512_reduce_and_epi32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_and_epi32(a); + #else + simde__m512i_private a_; + int32_t r; + a_ = simde__m512i_to_private(a); + + r = ~INT32_C(0); + SIMDE_VECTORIZE_REDUCTION(&:r) + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r &= a_.i32[i]; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_and_epi32(a) simde_mm512_reduce_and_epi32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_mm512_reduce_and_epi64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_and_epi64(a); + #else + simde__m512i_private a_; + int64_t r; + a_ = simde__m512i_to_private(a); + + r = ~INT64_C(0); + SIMDE_VECTORIZE_REDUCTION(&:r) + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + r &= a_.i64[i]; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_and_epi64(a) simde_mm512_reduce_and_epi64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_mm512_reduce_mul_epi32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_mul_epi32(a); + #else + simde__m512i_private a_; + int32_t r; + a_ = simde__m512i_to_private(a); + + r = INT32_C(1); + 
SIMDE_VECTORIZE_REDUCTION(*:r) + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r *= a_.i32[i]; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_mul_epi32(a) simde_mm512_reduce_mul_epi32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_mm512_reduce_mul_epi64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_mul_epi64(a); + #else + simde__m512i_private a_; + int64_t r; + a_ = simde__m512i_to_private(a); + + r = INT64_C(1); + SIMDE_VECTORIZE_REDUCTION(*:r) + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + r *= a_.i64[i]; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_mul_epi64(a) simde_mm512_reduce_mul_epi64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_mm512_reduce_or_epi32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_or_epi32(a); + #else + simde__m512i_private a_; + int32_t r; + a_ = simde__m512i_to_private(a); + + r = INT32_C(0); + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r |= a_.i32[i]; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_or_epi32(a) simde_mm512_reduce_or_epi32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_mm512_reduce_or_epi64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_or_epi64(a); + #else + simde__m512i_private a_; + int64_t r; + a_ = simde__m512i_to_private(a); + + r = INT64_C(0); + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + r |= a_.i64[i]; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_or_epi64(a) simde_mm512_reduce_or_epi64((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git 
a/thirdparty/simde/x86/avx512/rol.h b/thirdparty/simde/x86/avx512/rol.h index 5bdf98bc1..74b1f6572 100644 --- a/thirdparty/simde/x86/avx512/rol.h +++ b/thirdparty/simde/x86/avx512/rol.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2023 Michael R. 
Crusoe + */ + #if !defined(SIMDE_X86_AVX512_ROL_H) #define SIMDE_X86_AVX512_ROL_H diff --git a/thirdparty/simde/x86/avx512/rolv.h b/thirdparty/simde/x86/avx512/rolv.h index a14442ff9..8dc4e68cb 100644 --- a/thirdparty/simde/x86/avx512/rolv.h +++ b/thirdparty/simde/x86/avx512/rolv.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_ROLV_H) #define SIMDE_X86_AVX512_ROLV_H diff --git a/thirdparty/simde/x86/avx512/ror.h b/thirdparty/simde/x86/avx512/ror.h index 7cac56c7e..e39da562b 100644 --- a/thirdparty/simde/x86/avx512/ror.h +++ b/thirdparty/simde/x86/avx512/ror.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2023 Michael R. 
Crusoe + */ + #if !defined(SIMDE_X86_AVX512_ROR_H) #define SIMDE_X86_AVX512_ROR_H diff --git a/thirdparty/simde/x86/avx512/rorv.h b/thirdparty/simde/x86/avx512/rorv.h index ae87cec84..00e59bd75 100644 --- a/thirdparty/simde/x86/avx512/rorv.h +++ b/thirdparty/simde/x86/avx512/rorv.h @@ -1,3 +1,29 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2021 Kunwar Maheep Singh + */ + #if !defined(SIMDE_X86_AVX512_RORV_H) #define SIMDE_X86_AVX512_RORV_H diff --git a/thirdparty/simde/x86/avx512/round.h b/thirdparty/simde/x86/avx512/round.h index 684dbe045..4adb479b5 100644 --- a/thirdparty/simde/x86/avx512/round.h +++ b/thirdparty/simde/x86/avx512/round.h @@ -1,7 +1,36 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2021 Christopher Moore + * 2023-2025 Michael R. 
Crusoe + */ + #if !defined(SIMDE_X86_AVX512_ROUND_H) #define SIMDE_X86_AVX512_ROUND_H #include "types.h" +#include "setzero.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS diff --git a/thirdparty/simde/x86/avx512/roundscale.h b/thirdparty/simde/x86/avx512/roundscale.h index 80c9abf2b..33d380e3f 100644 --- a/thirdparty/simde/x86/avx512/roundscale.h +++ b/thirdparty/simde/x86/avx512/roundscale.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2023-2025 Michael R. 
Crusoe + */ + #if !defined(SIMDE_X86_AVX512_ROUNDSCALE_H) #define SIMDE_X86_AVX512_ROUNDSCALE_H @@ -7,6 +34,7 @@ #include "mul.h" #include "round.h" #include "cmpeq.h" +#include "setzero.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS diff --git a/thirdparty/simde/x86/avx512/roundscale_round.h b/thirdparty/simde/x86/avx512/roundscale_round.h index f941e48da..a3912ef54 100644 --- a/thirdparty/simde/x86/avx512/roundscale_round.h +++ b/thirdparty/simde/x86/avx512/roundscale_round.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2023 Michael R. 
Crusoe + */ + #if !defined(SIMDE_X86_AVX512_ROUNDSCALE_ROUND_H) #define SIMDE_X86_AVX512_ROUNDSCALE_ROUND_H diff --git a/thirdparty/simde/x86/avx512/scalef.h b/thirdparty/simde/x86/avx512/scalef.h index 116733175..e5a2868e7 100644 --- a/thirdparty/simde/x86/avx512/scalef.h +++ b/thirdparty/simde/x86/avx512/scalef.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2022 Evan Nemerson + */ + #if !defined(SIMDE_X86_AVX512_SCALEF_H) #define SIMDE_X86_AVX512_SCALEF_H diff --git a/thirdparty/simde/x86/avx512/shldv.h b/thirdparty/simde/x86/avx512/shldv.h index 1cd38f1f6..2a289886f 100644 --- a/thirdparty/simde/x86/avx512/shldv.h +++ b/thirdparty/simde/x86/avx512/shldv.h @@ -1,3 +1,30 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2021 Kunwar Maheep Singh + * 2021 Evan Nemerson + */ + #if !defined(SIMDE_X86_AVX512_SHLDV_H) #define SIMDE_X86_AVX512_SHLDV_H diff --git a/thirdparty/simde/x86/avx512/shuffle.h b/thirdparty/simde/x86/avx512/shuffle.h index d1c537f34..14faee1bf 100644 --- a/thirdparty/simde/x86/avx512/shuffle.h +++ b/thirdparty/simde/x86/avx512/shuffle.h @@ -97,7 +97,7 @@ simde_mm512_maskz_shuffle_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b #endif #if defined(SIMDE_X86_AVX512F_NATIVE) -# define simde_mm512_shuffle_epi32(a, imm8) _mm512_shuffle_epi32((a), (imm8)) +# define simde_mm512_shuffle_epi32(a, imm8) _mm512_shuffle_epi32((a), HEDLEY_STATIC_CAST(_MM_PERM_ENUM, (imm8))) #elif defined(SIMDE_STATEMENT_EXPR_) # define simde_mm512_shuffle_epi32(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512i_private simde_mm512_shuffle_epi32_r_, \ diff --git a/thirdparty/simde/x86/avx512/slli.h b/thirdparty/simde/x86/avx512/slli.h index d2ad75b7a..a7aa91bd6 100644 --- a/thirdparty/simde/x86/avx512/slli.h +++ b/thirdparty/simde/x86/avx512/slli.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Hidayat Khan * 2020 Christopher Moore + * 2025 Michael R. 
Crusoe */ #if !defined(SIMDE_X86_AVX512_SLLI_H) @@ -38,6 +39,46 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_mask_slli_epi16(src, k, a, imm8) _mm_mask_slli_epi16(src, k, a, imm8) +#else + #define simde_mm_mask_slli_epi16(src, k, a, imm8) simde_mm_mask_mov_epi16(src, k, simde_mm_slli_epi16(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_slli_epi16 + #define _mm_mask_slli_epi16(src, k, a, imm8) simde_mm_mask_slli_epi16(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_maskz_slli_epi16(k, a, imm8) _mm_maskz_slli_epi16(k, a, imm8) +#else + #define simde_mm_maskz_slli_epi16(k, a, imm8) simde_mm_maskz_mov_epi16((k), simde_mm_slli_epi16((a), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_slli_epi16 + #define _mm_maskz_slli_epi16(k, a, imm8) simde_mm_maskz_slli_epi16(k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_mask_slli_epi16(src, k, a, imm8) _mm256_mask_slli_epi16(src, k, a, imm8) +#else + #define simde_mm256_mask_slli_epi16(src, k, a, imm8) simde_mm256_mask_mov_epi16(src, k, simde_mm256_slli_epi16(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_slli_epi16 + #define _mm256_mask_slli_epi16(src, k, a, imm8) simde_mm256_mask_slli_epi16(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_maskz_slli_epi16(k, a, imm8) _mm256_maskz_slli_epi16(k, a, imm8) +#else + #define simde_mm256_maskz_slli_epi16(k, a, imm8) 
simde_mm256_maskz_mov_epi16(k, simde_mm256_slli_epi16(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_slli_epi16 + #define _mm256_maskz_slli_epi16(k, a, imm8) simde_mm256_maskz_slli_epi16(k, a, imm8) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_slli_epi16 (simde__m512i a, const unsigned int imm8) @@ -75,6 +116,66 @@ simde_mm512_slli_epi16 (simde__m512i a, const unsigned int imm8) #define _mm512_slli_epi16(a, imm8) simde_mm512_slli_epi16(a, imm8) #endif +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_mask_slli_epi16(src, k, a, imm8) _mm512_mask_slli_epi16(src, k, a, imm8) +#else + #define simde_mm512_mask_slli_epi16(src, k, a, imm8) simde_mm512_mask_mov_epi16(src, k, simde_mm512_slli_epi16(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_slli_epi16 + #define _mm512_mask_slli_epi16(src, k, a, imm8) simde_mm512_mask_slli_epi16(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_maskz_slli_epi16(k, a, imm8) _mm512_maskz_slli_epi16(k, a, imm8) +#else + #define simde_mm512_maskz_slli_epi16(k, a, imm8) simde_mm512_maskz_mov_epi16(k, simde_mm512_slli_epi16(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_slli_epi16 + #define _mm512_maskz_slli_epi16(k, a, imm8) simde_mm512_maskz_slli_epi16(k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_mask_slli_epi32(src, k, a, imm8) _mm_mask_slli_epi32(src, k, a, imm8) +#else + #define simde_mm_mask_slli_epi32(src, k, a, imm8) simde_mm_mask_mov_epi32(src, k, simde_mm_slli_epi32(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_slli_epi32 + #define _mm_mask_slli_epi32(src, k, a, imm8) simde_mm_mask_slli_epi32(src, k, a, imm8) 
+#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_maskz_slli_epi32(k, a, imm8) _mm_maskz_slli_epi32(k, a, imm8) +#else + #define simde_mm_maskz_slli_epi32(k, a, imm8) simde_mm_maskz_mov_epi32(k, simde_mm_slli_epi32(a, HEDLEY_STATIC_CAST(int, imm8))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_slli_epi32 + #define _mm_maskz_slli_epi32(k, a, imm8) simde_mm_maskz_slli_epi32(k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_mask_slli_epi32(src, k, a, imm8) _mm256_mask_slli_epi32(src, k, a, imm8) +#else + #define simde_mm256_mask_slli_epi32(src, k, a, imm8) simde_mm256_mask_mov_epi32(src, k, simde_mm256_slli_epi32(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_slli_epi32 + #define _mm256_mask_slli_epi32(src, k, a, imm8) simde_mm256_mask_slli_epi32(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_maskz_slli_epi32(k, a, imm8) _mm256_maskz_slli_epi32(k, a, imm8) +#else + #define simde_mm256_maskz_slli_epi32(k, a, imm8) simde_mm256_maskz_mov_epi32(k, simde_mm256_slli_epi32(a, HEDLEY_STATIC_CAST(int, imm8))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_slli_epi32 + #define _mm256_maskz_slli_epi32(k, a, imm8) simde_mm256_maskz_slli_epi32(k, a, imm8) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_slli_epi32 (simde__m512i a, unsigned int imm8) { @@ -125,6 +226,66 @@ simde_mm512_slli_epi32 (simde__m512i a, unsigned int imm8) { #define _mm512_slli_epi32(a, imm8) simde_mm512_slli_epi32(a, imm8) #endif +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_slli_epi32(src, k, a, 
imm8) _mm512_mask_slli_epi32(src, k, a, imm8) +#else + #define simde_mm512_mask_slli_epi32(src, k, a, imm8) simde_mm512_mask_mov_epi32(src, k, simde_mm512_slli_epi32(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_slli_epi32 + #define _mm512_mask_slli_epi32(src, k, a, imm8) simde_mm512_mask_slli_epi32(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_maskz_slli_epi32(k, a, imm8) _mm512_maskz_slli_epi32(k, a, imm8) +#else + #define simde_mm512_maskz_slli_epi32(k, a, imm8) simde_mm512_maskz_mov_epi32(k, simde_mm512_slli_epi32(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_slli_epi32 + #define _mm512_maskz_slli_epi32(k, a, imm8) simde_mm512_maskz_slli_epi32(k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_mask_slli_epi64(src, k, a, imm8) _mm_mask_slli_epi64(src, k, a, imm8) +#else + #define simde_mm_mask_slli_epi64(src, k, a, imm8) simde_mm_mask_mov_epi64(src, k, simde_mm_slli_epi64(a, HEDLEY_STATIC_CAST(int, imm8))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_slli_epi64 + #define _mm_mask_slli_epi64(src, k, a, imm8) simde_mm_mask_slli_epi64(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_maskz_slli_epi64(k, a, imm8) _mm_maskz_slli_epi64(k, a, imm8) +#else + #define simde_mm_maskz_slli_epi64(k, a, imm8) simde_mm_maskz_mov_epi64(k, simde_mm_slli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_slli_epi64 + #define _mm_maskz_slli_epi64(k, a, imm8) simde_mm_maskz_slli_epi64(k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_mask_slli_epi64(src, k, 
a, imm8) _mm256_mask_slli_epi64(src, k, a, imm8) +#else + #define simde_mm256_mask_slli_epi64(src, k, a, imm8) simde_mm256_mask_mov_epi64(src, k, simde_mm256_slli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_slli_epi64 + #define _mm256_mask_slli_epi64(src, k, a, imm8) simde_mm256_mask_slli_epi64(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_maskz_slli_epi64(k, a, imm8) _mm256_maskz_slli_epi64(k, a, imm8) +#else + #define simde_mm256_maskz_slli_epi64(k, a, imm8) simde_mm256_maskz_mov_epi64(k, simde_mm256_slli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_slli_epi64 + #define _mm256_maskz_slli_epi64(k, a, imm8) simde_mm256_maskz_slli_epi64(k, a, imm8) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_slli_epi64 (simde__m512i a, unsigned int imm8) { @@ -155,7 +316,7 @@ simde_mm512_slli_epi64 (simde__m512i a, unsigned int imm8) { r_.m128i[1] = simde_mm_slli_epi64(a_.m128i[1], HEDLEY_STATIC_CAST(int, imm8)); r_.m128i[2] = simde_mm_slli_epi64(a_.m128i[2], HEDLEY_STATIC_CAST(int, imm8)); r_.m128i[3] = simde_mm_slli_epi64(a_.m128i[3], HEDLEY_STATIC_CAST(int, imm8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_97248) + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_97248) && !defined(SIMDE_BUG_GCC_123807) r_.u64 = a_.u64 << imm8; #else SIMDE_VECTORIZE @@ -173,6 +334,26 @@ simde_mm512_slli_epi64 (simde__m512i a, unsigned int imm8) { #define _mm512_slli_epi64(a, imm8) simde_mm512_slli_epi64(a, imm8) #endif +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_slli_epi64(src, k, a, imm8) _mm512_mask_slli_epi64(src, k, a, imm8) +#else + #define simde_mm512_mask_slli_epi64(src, k, a, imm8) 
simde_mm512_mask_mov_epi64(src, k, simde_mm512_slli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_slli_epi64 + #define _mm512_mask_slli_epi64(src, k, a, imm8) simde_mm512_mask_slli_epi64(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_maskz_slli_epi64(k, a, imm8) _mm512_maskz_slli_epi64(k, a, imm8) +#else + #define simde_mm512_maskz_slli_epi64(k, a, imm8) simde_mm512_maskz_mov_epi64(k, simde_mm512_slli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_slli_epi64 + #define _mm512_maskz_slli_epi64(k, a, imm8) simde_mm512_maskz_slli_epi64(k, a, imm8) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/x86/avx512/srli.h b/thirdparty/simde/x86/avx512/srli.h index f240693b4..b0855db40 100644 --- a/thirdparty/simde/x86/avx512/srli.h +++ b/thirdparty/simde/x86/avx512/srli.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Hidayat Khan + * 2025 Michael R. 
Crusoe */ #if !defined(SIMDE_X86_AVX512_SRLI_H) @@ -37,9 +38,49 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_mask_srli_epi16(src, k, a, imm8) _mm_mask_srli_epi16(src, k, a, imm8) +#else + #define simde_mm_mask_srli_epi16(src, k, a, imm8) simde_mm_mask_mov_epi16(src, k, simde_mm_srli_epi16(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_srli_epi16 + #define _mm_mask_srli_epi16(src, k, a, imm8) simde_mm_mask_srli_epi16(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_maskz_srli_epi16(k, a, imm8) _mm_maskz_srli_epi16(k, a, imm8) +#else + #define simde_mm_maskz_srli_epi16(k, a, imm8) simde_mm_maskz_mov_epi16((k), simde_mm_srli_epi16((a), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_srli_epi16 + #define _mm_maskz_srli_epi16(k, a, imm8) simde_mm_maskz_srli_epi16(k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_mask_srli_epi16(src, k, a, imm8) _mm256_mask_srli_epi16(src, k, a, imm8) +#else + #define simde_mm256_mask_srli_epi16(src, k, a, imm8) simde_mm256_mask_mov_epi16(src, k, simde_mm256_srli_epi16(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_srli_epi16 + #define _mm256_mask_srli_epi16(src, k, a, imm8) simde_mm256_mask_srli_epi16(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_maskz_srli_epi16(k, a, imm8) _mm256_maskz_srli_epi16(k, a, imm8) +#else + #define simde_mm256_maskz_srli_epi16(k, a, imm8) 
simde_mm256_maskz_mov_epi16(k, simde_mm256_srli_epi16(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_srli_epi16 + #define _mm256_maskz_srli_epi16(k, a, imm8) simde_mm256_maskz_srli_epi16(k, a, imm8) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i -simde_mm512_srli_epi16 (simde__m512i a, const unsigned int imm8) +simde_mm512_srli_epi16(simde__m512i a, const unsigned int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { #if defined(SIMDE_X86_AVX512BW_NATIVE) && (defined(HEDLEY_GCC_VERSION) && ((__GNUC__ == 5 && __GNUC_MINOR__ == 5) || (__GNUC__ == 6 && __GNUC_MINOR__ >= 4))) simde__m512i r; @@ -77,9 +118,69 @@ simde_mm512_srli_epi16 (simde__m512i a, const unsigned int imm8) #define _mm512_srli_epi16(a, imm8) simde_mm512_srli_epi16(a, imm8) #endif +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_mask_srli_epi16(src, k, a, imm8) _mm512_mask_srli_epi16(src, k, a, imm8) +#else + #define simde_mm512_mask_srli_epi16(src, k, a, imm8) simde_mm512_mask_mov_epi16(src, k, simde_mm512_srli_epi16(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_srli_epi16 + #define _mm512_mask_srli_epi16(src, k, a, imm8) simde_mm512_mask_srli_epi16(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_maskz_srli_epi16(k, a, imm8) _mm512_maskz_srli_epi16(k, a, imm8) +#else + #define simde_mm512_maskz_srli_epi16(k, a, imm8) simde_mm512_maskz_mov_epi16(k, simde_mm512_srli_epi16(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_srli_epi16 + #define _mm512_maskz_srli_epi16(k, a, imm8) simde_mm512_maskz_srli_epi16(k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_mask_srli_epi32(src, k, a, imm8) _mm_mask_srli_epi32(src, k, a, imm8) +#else + #define simde_mm_mask_srli_epi32(src, k, a, imm8) 
simde_mm_mask_mov_epi32(src, k, simde_mm_srli_epi32(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_srli_epi32 + #define _mm_mask_srli_epi32(src, k, a, imm8) simde_mm_mask_srli_epi32(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_maskz_srli_epi32(k, a, imm8) _mm_maskz_srli_epi32(k, a, imm8) +#else + #define simde_mm_maskz_srli_epi32(k, a, imm8) simde_mm_maskz_mov_epi32(k, simde_mm_srli_epi32(a, HEDLEY_STATIC_CAST(int, imm8))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_srli_epi32 + #define _mm_maskz_srli_epi32(k, a, imm8) simde_mm_maskz_srli_epi32(k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_mask_srli_epi32(src, k, a, imm8) _mm256_mask_srli_epi32(src, k, a, imm8) +#else + #define simde_mm256_mask_srli_epi32(src, k, a, imm8) simde_mm256_mask_mov_epi32(src, k, simde_mm256_srli_epi32(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_srli_epi32 + #define _mm256_mask_srli_epi32(src, k, a, imm8) simde_mm256_mask_srli_epi32(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_maskz_srli_epi32(k, a, imm8) _mm256_maskz_srli_epi32(k, a, imm8) +#else + #define simde_mm256_maskz_srli_epi32(k, a, imm8) simde_mm256_maskz_mov_epi32(k, simde_mm256_srli_epi32(a, HEDLEY_STATIC_CAST(int, imm8))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_srli_epi32 + #define _mm256_maskz_srli_epi32(k, a, imm8) simde_mm256_maskz_srli_epi32(k, a, imm8) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i 
-simde_mm512_srli_epi32 (simde__m512i a, unsigned int imm8) { +simde_mm512_srli_epi32(simde__m512i a, unsigned int imm8) { #if defined(SIMDE_X86_AVX512F_NATIVE) && (defined(HEDLEY_GCC_VERSION) && ((__GNUC__ == 5 && __GNUC_MINOR__ == 5) || (__GNUC__ == 6 && __GNUC_MINOR__ >= 4))) simde__m512i r; @@ -124,9 +225,69 @@ simde_mm512_srli_epi32 (simde__m512i a, unsigned int imm8) { #define _mm512_srli_epi32(a, imm8) simde_mm512_srli_epi32(a, imm8) #endif +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_srli_epi32(src, k, a, imm8) _mm512_mask_srli_epi32(src, k, a, imm8) +#else + #define simde_mm512_mask_srli_epi32(src, k, a, imm8) simde_mm512_mask_mov_epi32(src, k, simde_mm512_srli_epi32(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_srli_epi32 + #define _mm512_mask_srli_epi32(src, k, a, imm8) simde_mm512_mask_srli_epi32(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_maskz_srli_epi32(k, a, imm8) _mm512_maskz_srli_epi32(k, a, imm8) +#else + #define simde_mm512_maskz_srli_epi32(k, a, imm8) simde_mm512_maskz_mov_epi32(k, simde_mm512_srli_epi32(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_srli_epi32 + #define _mm512_maskz_srli_epi32(k, a, imm8) simde_mm512_maskz_srli_epi32(k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_mask_srli_epi64(src, k, a, imm8) _mm_mask_srli_epi64(src, k, a, imm8) +#else + #define simde_mm_mask_srli_epi64(src, k, a, imm8) simde_mm_mask_mov_epi64(src, k, simde_mm_srli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_srli_epi64 + #define _mm_mask_srli_epi64(src, k, a, imm8) simde_mm_mask_srli_epi64(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define 
simde_mm_maskz_srli_epi64(k, a, imm8) _mm_maskz_srli_epi64(k, a, imm8) +#else + #define simde_mm_maskz_srli_epi64(k, a, imm8) simde_mm_maskz_mov_epi64(k, simde_mm_srli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_srli_epi64 + #define _mm_maskz_srli_epi64(k, a, imm8) simde_mm_maskz_srli_epi64(k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_mask_srli_epi64(src, k, a, imm8) _mm256_mask_srli_epi64(src, k, a, imm8) +#else + #define simde_mm256_mask_srli_epi64(src, k, a, imm8) simde_mm256_mask_mov_epi64(src, k, simde_mm256_srli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_srli_epi64 + #define _mm256_mask_srli_epi64(src, k, a, imm8) simde_mm256_mask_srli_epi64(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_maskz_srli_epi64(k, a, imm8) _mm256_maskz_srli_epi64(k, a, imm8) +#else + #define simde_mm256_maskz_srli_epi64(k, a, imm8) simde_mm256_maskz_mov_epi64(k, simde_mm256_srli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_srli_epi64 + #define _mm256_maskz_srli_epi64(k, a, imm8) simde_mm256_maskz_srli_epi64(k, a, imm8) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i -simde_mm512_srli_epi64 (simde__m512i a, unsigned int imm8) { +simde_mm512_srli_epi64(simde__m512i a, unsigned int imm8) { #if defined(SIMDE_X86_AVX512F_NATIVE) && (defined(HEDLEY_GCC_VERSION) && ((__GNUC__ == 5 && __GNUC_MINOR__ == 5) || (__GNUC__ == 6 && __GNUC_MINOR__ >= 4))) simde__m512i r; @@ -155,7 +316,7 @@ simde_mm512_srli_epi64 (simde__m512i a, unsigned int imm8) { if (imm8 > 63) { simde_memset(&r_, 0, sizeof(r_)); } else { - #if 
defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_97248) + #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_97248) && !defined(SIMDE_BUG_GCC_123807) r_.u64 = a_.u64 >> imm8; #else SIMDE_VECTORIZE @@ -174,6 +335,26 @@ simde_mm512_srli_epi64 (simde__m512i a, unsigned int imm8) { #define _mm512_srli_epi64(a, imm8) simde_mm512_srli_epi64(a, imm8) #endif +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_srli_epi64(src, k, a, imm8) _mm512_mask_srli_epi64(src, k, a, imm8) +#else + #define simde_mm512_mask_srli_epi64(src, k, a, imm8) simde_mm512_mask_mov_epi64(src, k, simde_mm512_srli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_srli_epi64 + #define _mm512_mask_srli_epi64(src, k, a, imm8) simde_mm512_mask_srli_epi64(src, k, a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_maskz_srli_epi64(k, a, imm8) _mm512_maskz_srli_epi64(k, a, imm8) +#else + #define simde_mm512_maskz_srli_epi64(k, a, imm8) simde_mm512_maskz_mov_epi64(k, simde_mm512_srli_epi64(a, imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_srli_epi64 + #define _mm512_maskz_srli_epi64(k, a, imm8) simde_mm512_maskz_srli_epi64(k, a, imm8) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/x86/avx512/storeu.h b/thirdparty/simde/x86/avx512/storeu.h index e00801faf..940683453 100644 --- a/thirdparty/simde/x86/avx512/storeu.h +++ b/thirdparty/simde/x86/avx512/storeu.h @@ -29,12 +29,134 @@ #include "types.h" #include "mov.h" -#include "setzero.h" +#include "loadu.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#define simde_mm_storeu_epi8(mem_addr, a) simde_mm_storeu_si128(mem_addr, a) +#define simde_mm_storeu_epi16(mem_addr, a) simde_mm_storeu_si128(mem_addr, a) +#define simde_mm_storeu_epi32(mem_addr, a) simde_mm_storeu_si128(mem_addr, a) +#define simde_mm_storeu_epi64(mem_addr, a) 
simde_mm_storeu_si128(mem_addr, a) + +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_storeu_epi8 + #undef _mm_storeu_epi16 + #define _mm_storeu_epi8(mem_addr, a) simde_mm_storeu_si128(mem_addr, a) + #define _mm_storeu_epi16(mem_addr, a) simde_mm_storeu_si128(mem_addr, a) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_storeu_epi32 + #undef _mm_storeu_epi64 + #define _mm_storeu_epi32(mem_addr, a) simde_mm_storeu_si128(mem_addr, a) + #define _mm_storeu_epi64(mem_addr, a) simde_mm_storeu_si128(mem_addr, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_mask_storeu_epi8 (void * mem_addr, simde__mmask16 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm_mask_storeu_epi8(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m128i src = simde_mm_loadu_epi8(mem_addr); + simde_mm_storeu_epi8(mem_addr, simde_mm_mask_mov_epi8(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_storeu_epi8 + #define _mm_mask_storeu_epi8(mem_addr, k, a) simde_mm_mask_storeu_epi8(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_mask_storeu_epi16 (void * mem_addr, simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm_mask_storeu_epi16(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m128i src = simde_mm_loadu_epi16(mem_addr); + simde_mm_storeu_epi16(mem_addr, simde_mm_mask_mov_epi16(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_storeu_epi16 + #define _mm_mask_storeu_epi16(mem_addr, k, a) simde_mm_mask_storeu_epi16(mem_addr, k, a) +#endif + 
+SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_mask_storeu_epi32 (void * mem_addr, simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm_mask_storeu_epi32(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m128i src = simde_mm_loadu_epi32(mem_addr); + simde_mm_storeu_epi32(mem_addr, simde_mm_mask_mov_epi32(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_storeu_epi32 + #define _mm_mask_storeu_epi32(mem_addr, k, a) simde_mm_mask_storeu_epi32(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_mask_storeu_epi64 (void * mem_addr, simde__mmask8 k, simde__m128i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm_mask_storeu_epi64(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m128i src = simde_mm_loadu_epi64(mem_addr); + simde_mm_storeu_epi64(mem_addr, simde_mm_mask_mov_epi64(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_storeu_epi64 + #define _mm_mask_storeu_epi64(mem_addr, k, a) simde_mm_mask_storeu_epi64(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_storeu_ph (void * mem_addr, simde__m128h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm_storeu_ph(mem_addr, a); + #else + simde_memcpy(mem_addr, &a, sizeof(a)); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_storeu_ph + #define _mm_storeu_ph(mem_addr, a) simde_mm_storeu_ph(mem_addr, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_mask_storeu_ps(void * mem_addr, simde__mmask8 k, simde__m128 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + 
_mm_mask_storeu_ps(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m128 src = simde_mm_loadu_ps(HEDLEY_REINTERPRET_CAST(const simde_float32*, mem_addr)); + simde_mm_storeu_ps(HEDLEY_REINTERPRET_CAST(simde_float32*, mem_addr), simde_mm_mask_mov_ps(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_storeu_ps + #define _mm_mask_storeu_ps(mem_addr, k, a) simde_mm_mask_storeu_ps(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm_mask_storeu_pd(void * mem_addr, simde__mmask8 k, simde__m128d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm_mask_storeu_pd(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m128d src = simde_mm_loadu_pd(HEDLEY_REINTERPRET_CAST(const simde_float64*, mem_addr)); + simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(simde_float64*, mem_addr), simde_mm_mask_mov_pd(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_storeu_pd + #define _mm_mask_storeu_pd(mem_addr, k, a) simde_mm_mask_storeu_pd(mem_addr, k, a) +#endif + #define simde_mm256_storeu_epi8(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) #define simde_mm256_storeu_epi16(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) #define simde_mm256_storeu_epi32(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) @@ -52,14 +174,43 @@ SIMDE_BEGIN_DECLS_ #define _mm256_storeu_epi64(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm256_storeu_ph (void * mem_addr, simde__m256h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm256_storeu_ph(mem_addr, a); + #else + simde_memcpy(mem_addr, &a, sizeof(a)); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + 
#undef _mm256_storeu_ph + #define _mm256_storeu_ph(mem_addr, a) simde_mm256_storeu_ph(mem_addr, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm256_mask_storeu_epi8 (void * mem_addr, simde__mmask32 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm256_mask_storeu_epi8(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m256i src = simde_mm256_loadu_epi8(mem_addr); + simde_mm256_storeu_epi8(mem_addr, simde_mm256_mask_mov_epi8(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_storeu_epi8 + #define _mm256_mask_storeu_epi8(mem_addr, k, a) simde_mm256_mask_storeu_epi8(mem_addr, k, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_mm256_mask_storeu_epi16 (void * mem_addr, simde__mmask16 k, simde__m256i a) { #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) _mm256_mask_storeu_epi16(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); #else - const simde__m256i zero = simde_mm256_setzero_si256(); - simde_mm256_storeu_epi16(mem_addr, simde_mm256_mask_mov_epi16(zero, k, a)); + const simde__m256i src = simde_mm256_loadu_epi16(mem_addr); + simde_mm256_storeu_epi16(mem_addr, simde_mm256_mask_mov_epi16(src, k, a)); #endif } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) @@ -67,6 +218,66 @@ simde_mm256_mask_storeu_epi16 (void * mem_addr, simde__mmask16 k, simde__m256i a #define _mm256_mask_storeu_epi16(mem_addr, k, a) simde_mm256_mask_storeu_epi16(mem_addr, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm256_mask_storeu_epi32 (void * mem_addr, simde__mmask8 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm256_mask_storeu_epi32(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m256i src = simde_mm256_loadu_epi32(mem_addr); + 
simde_mm256_storeu_epi32(mem_addr, simde_mm256_mask_mov_epi32(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_storeu_epi32 + #define _mm256_mask_storeu_epi32(mem_addr, k, a) simde_mm256_mask_storeu_epi32(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm256_mask_storeu_epi64 (void * mem_addr, simde__mmask8 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm256_mask_storeu_epi64(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m256i src = simde_mm256_loadu_epi64(mem_addr); + simde_mm256_storeu_epi64(mem_addr, simde_mm256_mask_mov_epi64(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_storeu_epi64 + #define _mm256_mask_storeu_epi64(mem_addr, k, a) simde_mm256_mask_storeu_epi64(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm256_mask_storeu_ps (void * mem_addr, simde__mmask8 k, simde__m256 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm256_mask_storeu_ps(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m256 src = simde_mm256_loadu_ps(HEDLEY_REINTERPRET_CAST(const simde_float32*, mem_addr)); + simde_mm256_storeu_ps(HEDLEY_REINTERPRET_CAST(simde_float32*, mem_addr), simde_mm256_mask_mov_ps(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_storeu_ps + #define _mm256_mask_storeu_ps(mem_addr, k, a) simde_mm256_mask_storeu_ps(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm256_mask_storeu_pd (void * mem_addr, simde__mmask8 k, simde__m256d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm256_mask_storeu_pd(HEDLEY_REINTERPRET_CAST(void*, 
mem_addr), k, a); + #else + const simde__m256d src = simde_mm256_loadu_pd(HEDLEY_REINTERPRET_CAST(const simde_float64*, mem_addr)); + simde_mm256_storeu_pd(HEDLEY_REINTERPRET_CAST(simde_float64*, mem_addr), simde_mm256_mask_mov_pd(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_storeu_pd + #define _mm256_mask_storeu_pd(mem_addr, k, a) simde_mm256_mask_storeu_pd(mem_addr, k, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_storeu_ps (void * mem_addr, simde__m512 a) { @@ -137,14 +348,29 @@ simde_mm512_storeu_si512 (void * mem_addr, simde__m512i a) { #define _mm512_storeu_epi64(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_epi8 (void * mem_addr, simde__mmask64 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + _mm512_mask_storeu_epi8(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512i src = simde_mm512_loadu_epi8(mem_addr); + simde_mm512_storeu_epi8(mem_addr, simde_mm512_mask_mov_epi8(src, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_epi8 + #define _mm512_mask_storeu_epi8(mem_addr, k, a) simde_mm512_mask_storeu_epi8(mem_addr, k, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_mask_storeu_epi16 (void * mem_addr, simde__mmask32 k, simde__m512i a) { #if defined(SIMDE_X86_AVX512BW_NATIVE) _mm512_mask_storeu_epi16(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); #else - const simde__m512i zero = simde_mm512_setzero_si512(); - simde_mm512_storeu_epi16(mem_addr, simde_mm512_mask_mov_epi16(zero, k, a)); + const simde__m512i src = simde_mm512_loadu_epi16(mem_addr); + simde_mm512_storeu_epi16(mem_addr, simde_mm512_mask_mov_epi16(src, k, a)); #endif } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) @@ -158,8 +384,8 @@ simde_mm512_mask_storeu_epi32 (void * mem_addr, simde__mmask16 k, 
simde__m512i a #if defined(SIMDE_X86_AVX512F_NATIVE) _mm512_mask_storeu_epi32(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); #else - const simde__m512i zero = simde_mm512_setzero_si512(); - simde_mm512_storeu_epi32(mem_addr, simde_mm512_mask_mov_epi32(zero, k, a)); + const simde__m512i src = simde_mm512_loadu_epi32(mem_addr); + simde_mm512_storeu_epi32(mem_addr, simde_mm512_mask_mov_epi32(src, k, a)); #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) @@ -173,8 +399,8 @@ simde_mm512_mask_storeu_epi64 (void * mem_addr, simde__mmask8 k, simde__m512i a) #if defined(SIMDE_X86_AVX512F_NATIVE) _mm512_mask_storeu_epi64(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); #else - const simde__m512i zero = simde_mm512_setzero_si512(); - simde_mm512_storeu_epi64(mem_addr, simde_mm512_mask_mov_epi64(zero, k, a)); + const simde__m512i src = simde_mm512_loadu_epi64(mem_addr); + simde_mm512_storeu_epi64(mem_addr, simde_mm512_mask_mov_epi64(src, k, a)); #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) @@ -188,8 +414,8 @@ simde_mm512_mask_storeu_ps (void * mem_addr, simde__mmask16 k, simde__m512 a) { #if defined(SIMDE_X86_AVX512F_NATIVE) _mm512_mask_storeu_ps(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); #else - const simde__m512 zero = simde_mm512_setzero_ps(); - simde_mm512_storeu_ps(mem_addr, simde_mm512_mask_mov_ps(zero, k, a)); + const simde__m512 src = simde_mm512_loadu_ps(mem_addr); + simde_mm512_storeu_ps(mem_addr, simde_mm512_mask_mov_ps(src, k, a)); #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) @@ -203,8 +429,8 @@ simde_mm512_mask_storeu_pd (void * mem_addr, simde__mmask8 k, simde__m512d a) { #if defined(SIMDE_X86_AVX512F_NATIVE) _mm512_mask_storeu_pd(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); #else - const simde__m512d zero = simde_mm512_setzero_pd(); - simde_mm512_storeu_pd(mem_addr, simde_mm512_mask_mov_pd(zero, k, a)); + const simde__m512d src = simde_mm512_loadu_pd(mem_addr); + simde_mm512_storeu_pd(mem_addr, 
simde_mm512_mask_mov_pd(src, k, a)); #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) diff --git a/thirdparty/simde/x86/avx512/sub.h b/thirdparty/simde/x86/avx512/sub.h index 6e44d85a7..f3ac3ef07 100644 --- a/thirdparty/simde/x86/avx512/sub.h +++ b/thirdparty/simde/x86/avx512/sub.h @@ -24,6 +24,7 @@ * 2020 Evan Nemerson * 2020 Himanshi Mathur * 2020 Hidayat Khan + * 2025 Mickey Zhu */ #if !defined(SIMDE_X86_AVX512_SUB_H) @@ -37,6 +38,36 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_mask_sub_epi16(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_mask_sub_epi16(src, k, a, b); + #else + simde__m128i r = simde_mm_sub_epi16(a, b); + return simde_mm_mask_mov_epi16(src, k, r); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_mask_sub_epi16 + #define _mm_mask_sub_epi16(src, k, a, b) simde_mm_mask_sub_epi16(src, k, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_maskz_sub_epi16(simde__mmask8 k, simde__m128i a, simde__m128i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm_maskz_sub_epi16(k, a, b); + #else + simde__m128i r = simde_mm_sub_epi16(a, b); + return simde_mm_maskz_mov_epi16(k, r); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm_maskz_sub_epi16 + #define _mm_maskz_sub_epi16(k, a, b) simde_mm_maskz_sub_epi16(k, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_sub_epi8 (simde__m512i a, simde__m512i b) { diff --git a/thirdparty/simde/x86/avx512/types.h b/thirdparty/simde/x86/avx512/types.h index 25acf9ebf..c39b993b5 100644 --- a/thirdparty/simde/x86/avx512/types.h +++ b/thirdparty/simde/x86/avx512/types.h @@ -58,6 
+58,131 @@ SIMDE_BEGIN_DECLS_ # define SIMDE_AVX512_ALIGN SIMDE_ALIGN_TO_64 # endif +typedef union { + #if defined(SIMDE_VECTOR_SUBSCRIPT) + SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + #if defined(SIMDE_HAVE_INT128_) + SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + #endif + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + #else + SIMDE_ALIGN_TO_16 simde_float16 f16[8]; + #endif + SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + #else + SIMDE_ALIGN_TO_16 int8_t i8[16]; + SIMDE_ALIGN_TO_16 int16_t i16[8]; + SIMDE_ALIGN_TO_16 int32_t i32[4]; + SIMDE_ALIGN_TO_16 int64_t i64[2]; + SIMDE_ALIGN_TO_16 uint8_t u8[16]; + SIMDE_ALIGN_TO_16 uint16_t u16[8]; + SIMDE_ALIGN_TO_16 uint32_t u32[4]; + SIMDE_ALIGN_TO_16 uint64_t u64[2]; + #if defined(SIMDE_HAVE_INT128_) + SIMDE_ALIGN_TO_16 simde_int128 i128[1]; + SIMDE_ALIGN_TO_16 simde_uint128 u128[1]; + #endif + SIMDE_ALIGN_TO_16 simde_float16 f16[8]; + SIMDE_ALIGN_TO_16 simde_float32 f32[4]; + SIMDE_ALIGN_TO_16 simde_float64 f64[2]; + SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)]; + SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; + 
#endif + + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + SIMDE_ALIGN_TO_16 __m128h n; + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; + #endif + #endif +} simde__m128h_private; + +typedef union { + #if defined(SIMDE_VECTOR_SUBSCRIPT) + SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + #if defined(SIMDE_HAVE_INT128_) + SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + #endif + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + #else + SIMDE_ALIGN_TO_32 simde_float16 f16[16]; + #endif + SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 simde_float64 
f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + #else + SIMDE_ALIGN_TO_32 int8_t i8[32]; + SIMDE_ALIGN_TO_32 int16_t i16[16]; + SIMDE_ALIGN_TO_32 int32_t i32[8]; + SIMDE_ALIGN_TO_32 int64_t i64[4]; + SIMDE_ALIGN_TO_32 uint8_t u8[32]; + SIMDE_ALIGN_TO_32 uint16_t u16[16]; + SIMDE_ALIGN_TO_32 uint32_t u32[8]; + SIMDE_ALIGN_TO_32 uint64_t u64[4]; + #if defined(SIMDE_HAVE_INT128_) + SIMDE_ALIGN_TO_32 simde_int128 i128[2]; + SIMDE_ALIGN_TO_32 simde_uint128 u128[2]; + #endif + SIMDE_ALIGN_TO_32 simde_float16 f16[16]; + SIMDE_ALIGN_TO_32 simde_float32 f32[8]; + SIMDE_ALIGN_TO_32 simde_float64 f64[4]; + SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)]; + SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)]; + #endif + + SIMDE_ALIGN_TO_32 simde__m128h_private m128h_private[2]; + // SIMDE_ALIGN_TO_32 simde__m128h m128h[2]; + + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + SIMDE_ALIGN_TO_32 __m256h n; + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[2]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2]; + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[2]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; + #endif + #endif +} simde__m256h_private; + typedef union { #if 
defined(SIMDE_VECTOR_SUBSCRIPT) SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; @@ -442,7 +567,6 @@ typedef union { #endif } simde__m512h_private; - typedef union { #if defined(SIMDE_VECTOR_SUBSCRIPT) SIMDE_AVX512_ALIGN int8_t i8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; @@ -569,14 +693,24 @@ typedef union { #if defined(SIMDE_X86_AVX512FP16_NATIVE) typedef __m512h simde__m512h; + typedef __m256h simde__m256h; + typedef __m128h simde__m128h; #else #if defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_FLOAT16_VECTOR) + typedef simde_float16 simde__m128h SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + typedef simde_float16 simde__m256h SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; typedef simde_float16 simde__m512h SIMDE_AVX512_ALIGN SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; #else + typedef simde__m128h_private simde__m128h; + typedef simde__m256h_private simde__m256h; typedef simde__m512h_private simde__m512h; #endif #endif +#if !defined(__mmask8) && defined(SIMDE_ENABLE_NATIVE_ALIASES) +typedef uint8_t __mmask8; +#endif + /* These are really part of AVX-512VL / AVX-512BW (in GCC __mmask32 is * in avx512vlintrin.h and __mmask64 is in avx512bwintrin.h, in clang * both are in avx512bwintrin.h), not AVX-512F. 
However, we don't have @@ -595,14 +729,14 @@ typedef uint64_t simde__mmask64; #if !defined(HEDLEY_INTEL_VERSION) typedef uint16_t __mmask16; #else - #define __mmask16 uint16_t; + #define __mmask16 uint16_t #endif #endif #if !defined(__mmask32) && defined(SIMDE_ENABLE_NATIVE_ALIASES) #if !defined(HEDLEY_INTEL_VERSION) typedef uint32_t __mmask32; #else - #define __mmask32 uint32_t; + #define __mmask32 uint32_t #endif #endif #if !defined(__mmask64) && defined(SIMDE_ENABLE_NATIVE_ALIASES) @@ -613,7 +747,7 @@ typedef uint64_t simde__mmask64; typedef uint64_t __mmask64; #endif #else - #define __mmask64 uint64_t; + #define __mmask64 uint64_t #endif #endif @@ -643,18 +777,22 @@ typedef uint64_t simde__mmask64; #if !defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) #if !defined(HEDLEY_INTEL_VERSION) - //typedef simde__m128h __m128h; - //typedef simde__m256h __m256h; + typedef simde__m128h __m128h; + typedef simde__m256h __m256h; typedef simde__m512h __m512h; #else - //#define __m128h simde__m128h - //#define __m256h simde__m256h + #define __m128h simde__m128h + #define __m256h simde__m256h #define __m512h simde__m512h #endif #endif +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128h), "simde__m128h size incorrect"); +HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128h_private), "simde__m128h_private size incorrect"); HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128bh), "simde__m128bh size incorrect"); HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128bh_private), "simde__m128bh_private size incorrect"); +HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256h), "simde__m256h size incorrect"); +HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256h_private), "simde__m256h_private size incorrect"); HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256bh), "simde__m256bh size incorrect"); HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256bh_private), "simde__m256bh_private size incorrect"); HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512bh), "simde__m512bh size incorrect"); @@ -668,8 +806,12 @@ 
HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512d_private), "simde__m512d_private s HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512h), "simde__m512h size incorrect"); HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512h_private), "simde__m512h_private size incorrect"); #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128h) == 16, "simde__m128h is not 16-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128h_private) == 16, "simde__m128h_private is not 16-byte aligned"); HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128bh) == 16, "simde__m128bh is not 16-byte aligned"); HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128bh_private) == 16, "simde__m128bh_private is not 16-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256h) == 32, "simde__m256h is not 32-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256h_private) == 32, "simde__m256h_private is not 32-byte aligned"); HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256bh) == 32, "simde__m256bh is not 16-byte aligned"); HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256bh_private) == 32, "simde__m256bh_private is not 16-byte aligned"); HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512bh) == 32, "simde__m512bh is not 32-byte aligned"); @@ -703,6 +845,22 @@ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512h_private) == 32, "simde__m512h_p #define _MM_CMPINT_TRUE SIMDE_CMPINT_TRUE #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128h +simde__m128h_from_private(simde__m128h_private v) { + simde__m128h r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128h_private +simde__m128h_to_private(simde__m128h v) { + simde__m128h_private r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128bh simde__m128bh_from_private(simde__m128bh_private v) { @@ -719,6 +877,22 @@ simde__m128bh_to_private(simde__m128bh v) { return r; } +SIMDE_FUNCTION_ATTRIBUTES +simde__m256h
+simde__m256h_from_private(simde__m256h_private v) { + simde__m256h r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256h_private +simde__m256h_to_private(simde__m256h v) { + simde__m256h_private r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + SIMDE_FUNCTION_ATTRIBUTES simde__m256bh simde__m256bh_from_private(simde__m256bh_private v) { diff --git a/thirdparty/simde/x86/clmul.h b/thirdparty/simde/x86/clmul.h index 7d51b5b3f..14be7917f 100644 --- a/thirdparty/simde/x86/clmul.h +++ b/thirdparty/simde/x86/clmul.h @@ -33,6 +33,17 @@ #if !defined(SIMDE_X86_CLMUL_H) #define SIMDE_X86_CLMUL_H +#include +#include + +#include "../hedley.h" +#include "../simde-detect-clang.h" +#include "../simde-diagnostic.h" +#include "../simde-features.h" +#include "../simde-common.h" +#include "sse2.h" +#include "avx.h" +#include "avx512/types.h" #include "avx512/set.h" #include "avx512/setzero.h" @@ -203,7 +214,7 @@ simde_mm_clmulepi64_si128 (simde__m128i a, simde__m128i b, const int imm8) #else #define simde_mm_clmulepi64_si128(a, b, imm8) _mm_clmulepi64_si128(a, b, imm8) #endif -#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) && !defined(__clang__) +#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) && !(SIMDE_DETECT_CLANG_VERSION_NOT(22,0,0)) #define simde_mm_clmulepi64_si128(a, b, imm8) \ simde__m128i_from_neon_u64( \ vreinterpretq_u64_p128( \ diff --git a/thirdparty/simde/x86/f16c.h b/thirdparty/simde/x86/f16c.h index 27828a44c..f2530c07f 100644 --- a/thirdparty/simde/x86/f16c.h +++ b/thirdparty/simde/x86/f16c.h @@ -24,13 +24,19 @@ * 2021 Evan Nemerson */ -#include "../simde-common.h" -#include "../simde-math.h" -#include "../simde-f16.h" + #if !defined(SIMDE_X86_F16C_H) #define SIMDE_X86_F16C_H +#include +#include "../hedley.h" +#include "../simde-diagnostic.h" +#include "../simde-features.h" +#include "../simde-common.h" +#include "../simde-f16.h" +#include "sse.h" +#include "sse2.h" 
#include "avx.h" #if !defined(SIMDE_X86_PF16C_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) @@ -47,23 +53,68 @@ simde_mm_cvtps_ph(simde__m128 a, const int imm8) { simde__m128_private a_ = simde__m128_to_private(a); simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - HEDLEY_STATIC_CAST(void, imm8); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - r_.neon_f16 = vcombine_f16(vcvt_f16_f32(a_.neon_f32), vdup_n_f16(SIMDE_FLOAT16_C(0.0))); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcvt_h_s((v4f32)__lsx_vreplgr2vr_w(0), a_.lsx_f32); - #elif defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f16[i] = simde_float16_from_float32(a_.f32[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); - } - #endif + switch (imm8 & ~SIMDE_MM_FROUND_NO_EXC) { + case SIMDE_MM_FROUND_CUR_DIRECTION: /* assumes current mode is half-to-even */ + case SIMDE_MM_FROUND_TO_NEAREST_INT: + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_NEON_FP16) + r_.neon_f16 = vcombine_f16(vcvt_f16_f32(a_.neon_f32), vdup_n_f16(SIMDE_FLOAT16_C(0.0))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcvt_h_s((v4f32)__lsx_vreplgr2vr_w(0), a_.lsx_f32); + #elif defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_float16_from_float32(a_.f32[i]); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); + } + #endif + break; + + case SIMDE_MM_FROUND_TO_NEG_INF: + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; 
i++) { + r_.f16[i] = simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_NEG_INF); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.u16[i] = simde_float16_as_uint16(simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_NEG_INF)); + } + #endif + break; + + case SIMDE_MM_FROUND_TO_POS_INF: + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_POS_INF); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.u16[i] = simde_float16_as_uint16(simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_POS_INF)); + } + #endif + break; + + case SIMDE_MM_FROUND_TO_ZERO: + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_ZERO); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.u16[i] = simde_float16_as_uint16(simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_ZERO)); + } + #endif + break; + } return simde__m128i_from_private(r_); } @@ -85,7 +136,7 @@ simde_mm_cvtph_ps(simde__m128i a) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_f32 = __lsx_vfcvtl_s_h(a_.lsx_i64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_NEON_FP16) r_.neon_f32 = vcvt_f32_f16(vget_low_f16(a_.neon_f16)); #elif defined(SIMDE_FLOAT16_VECTOR) SIMDE_VECTORIZE @@ -112,24 +163,68 @@ simde_mm256_cvtps_ph(simde__m256 a, const int imm8) { simde__m256_private a_ = simde__m256_to_private(a); simde__m128i_private r_; - HEDLEY_STATIC_CAST(void, imm8); + switch (imm8 & ~SIMDE_MM_FROUND_NO_EXC) { + case 
SIMDE_MM_FROUND_CUR_DIRECTION: /* assumes current mode is half-to-even */ + case SIMDE_MM_FROUND_TO_NEAREST_INT: + #if defined(SIMDE_LOONGARCH_LASX_NATIVE) + a_.i256 = __lasx_xvfcvt_h_s(a_.f256, a_.f256); + a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); + r_.lsx_i64 = simde_mm256_extractf128_si256(a_.i256, 0); + #elif defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_float16_from_float32(a_.f32[i]); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); + } + #endif + break; - #if defined(SIMDE_LOONGARCH_LASX_NATIVE) - a_.i256 = __lasx_xvfcvt_h_s(a_.f256, a_.f256); - a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); - r_.lsx_i64 = simde_mm256_extractf128_si256(a_.i256, 0); - #elif defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f16[i] = simde_float16_from_float32(a_.f32[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); - } - #endif + case SIMDE_MM_FROUND_TO_NEG_INF: + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_NEG_INF); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.u16[i] = simde_float16_as_uint16(simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_NEG_INF)); + } + #endif + break; + + case SIMDE_MM_FROUND_TO_POS_INF: + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_POS_INF); + } + #else + 
SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.u16[i] = simde_float16_as_uint16(simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_POS_INF)); + } + #endif + break; + case SIMDE_MM_FROUND_TO_ZERO: + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_ZERO); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.u16[i] = simde_float16_as_uint16(simde_x_float16_from_float32(a_.f32[i], SIMDE_F16_ROUND_TO_ZERO)); + } + #endif + break; + } return simde__m128i_from_private(r_); } diff --git a/thirdparty/simde/x86/fma.h b/thirdparty/simde/x86/fma.h index bb174284b..c9238a7e4 100644 --- a/thirdparty/simde/x86/fma.h +++ b/thirdparty/simde/x86/fma.h @@ -27,6 +27,16 @@ #if !defined(SIMDE_X86_FMA_H) #define SIMDE_X86_FMA_H +#include + +#include "../hedley.h" +#include "../simde-diagnostic.h" +#include "../simde-features.h" +#include "../simde-math.h" +#include "../simde-common.h" +#include "sse.h" +#include "sse2.h" +#include "sse3.h" #include "avx.h" #if !defined(SIMDE_X86_FMA_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) @@ -82,6 +92,16 @@ simde_mm256_fmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { return _mm256_fmadd_pd(a, b, c); #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) return __lasx_xvfmadd_d(a, b, c); + #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + simde__m256d_private + r_, + a_ = simde__m256d_to_private(a), + b_ = simde__m256d_to_private(b), + c_ = simde__m256d_to_private(c); + + r_.m128d[0] = simde_mm_fmadd_pd(a_.m128d[0], b_.m128d[0], c_.m128d[0]); + r_.m128d[1] = simde_mm_fmadd_pd(a_.m128d[1], b_.m128d[1], c_.m128d[1]); + return simde__m256d_from_private(r_); #else return simde_mm256_add_pd(simde_mm256_mul_pd(a, b), c); #endif diff --git a/thirdparty/simde/x86/gfni.h b/thirdparty/simde/x86/gfni.h index 
5982a3409..4d8425cc6 100644 --- a/thirdparty/simde/x86/gfni.h +++ b/thirdparty/simde/x86/gfni.h @@ -1,4 +1,6 @@ -/* Permission is hereby granted, free of charge, to any person +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, @@ -20,12 +22,26 @@ * * Copyright: * 2020-2021 Christopher Moore - * 2020 Evan Nemerson + * 2020-2022 Evan Nemerson + * 2023 Michael R. Crusoe */ #if !defined(SIMDE_X86_GFNI_H) #define SIMDE_X86_GFNI_H +#include +#include + +#include "../hedley.h" +#include "../simde-diagnostic.h" +#include "../simde-features.h" +#include "../simde-common.h" +#include "sse2.h" +#include "sse4.1.h" +#include "ssse3.h" +#include "avx.h" +#include "avx2.h" +#include "avx512/types.h" #include "avx512/add.h" #include "avx512/and.h" #include "avx512/broadcast.h" diff --git a/thirdparty/simde/x86/mmx.h b/thirdparty/simde/x86/mmx.h index e294af8e9..fe4914a20 100644 --- a/thirdparty/simde/x86/mmx.h +++ b/thirdparty/simde/x86/mmx.h @@ -1379,33 +1379,28 @@ simde_mm_sll_pi16 (simde__m64 a, simde__m64 count) { #if defined(SIMDE_X86_MMX_NATIVE) return _mm_sll_pi16(a, count); #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private r_, a_; simde__m64_private count_ = simde__m64_to_private(count); + if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { + return simde_mm_setzero_si64(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m64_to_private(a); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) HEDLEY_DIAGNOSTIC_PUSH #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) #pragma clang diagnostic ignored "-Wvector-conversion" #endif - r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(HEDLEY_STATIC_CAST(int16_t, vget_lane_u64(count_.neon_u64, 
0)))); + r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(HEDLEY_STATIC_CAST(int16_t, cnt))); HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) - return simde_mm_setzero_si64(); - - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count_.u64[0]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << count_.u64[0]; + r_.u16 = a_.u16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(16, cnt); #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count_.u64[0]); + r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << cnt); } #endif @@ -1424,28 +1419,28 @@ simde_mm_sll_pi32 (simde__m64 a, simde__m64 count) { #if defined(SIMDE_X86_MMX_NATIVE) return _mm_sll_pi32(a, count); #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private r_, a_; simde__m64_private count_ = simde__m64_to_private(count); + if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { + return simde_mm_setzero_si64(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m64_to_private(a); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) HEDLEY_DIAGNOSTIC_PUSH #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) #pragma clang diagnostic ignored "-Wvector-conversion" #endif - r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(HEDLEY_STATIC_CAST(int32_t, vget_lane_u64(count_.neon_u64, 0)))); + r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(HEDLEY_STATIC_CAST(int32_t, cnt))); HEDLEY_DIAGNOSTIC_POP #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << count_.u64[0]; + r_.u32 = a_.u32 << SIMDE_CAST_VECTOR_SHIFT_COUNT(32, cnt); #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { - simde_memset(&r_, 0, 
sizeof(r_)); - return simde__m64_from_private(r_); - } - SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] << count_.u64[0]; + r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i] << cnt); } #endif @@ -1557,21 +1552,21 @@ simde_mm_sll_si64 (simde__m64 a, simde__m64 count) { #if defined(SIMDE_X86_MMX_NATIVE) return _mm_sll_si64(a, count); #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private r_, a_; simde__m64_private count_ = simde__m64_to_private(count); + if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { + return simde_mm_setzero_si64(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m64_to_private(a); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64); + r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64(HEDLEY_STATIC_CAST(int64_t, cnt))); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 << count_.i64; + r_.u64 = a_.u64 << SIMDE_CAST_VECTOR_SHIFT_COUNT(64, cnt); #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - r_.u64[0] = a_.u64[0] << count_.u64[0]; + r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u64[0] << cnt); #endif return simde__m64_from_private(r_); @@ -1589,28 +1584,23 @@ simde_mm_srl_pi16 (simde__m64 a, simde__m64 count) { #if defined(SIMDE_X86_MMX_NATIVE) return _mm_srl_pi16(a, count); #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private r_, a_; simde__m64_private count_ = simde__m64_to_private(count); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) - return simde_mm_setzero_si64(); + if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { + return simde_mm_setzero_si64(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m64_to_private(a); - 
r_.u16 = a_.u16 >> HEDLEY_STATIC_CAST(uint16_t, count_.u64[0]); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> count_.u64[0]; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t) vget_lane_u64(count_.neon_u64, 0)))); + r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, cnt); #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - SIMDE_VECTORIZE for (size_t i = 0 ; i < sizeof(r_.u16) / sizeof(r_.u16[0]) ; i++) { - r_.u16[i] = a_.u16[i] >> count_.u64[0]; + r_.u16[i] = a_.u16[i] >> cnt; } #endif @@ -1629,23 +1619,23 @@ simde_mm_srl_pi32 (simde__m64 a, simde__m64 count) { #if defined(SIMDE_X86_MMX_NATIVE) return _mm_srl_pi32(a, count); #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private r_, a_; simde__m64_private count_ = simde__m64_to_private(count); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> count_.u64[0]; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t) vget_lane_u64(count_.neon_u64, 0)))); - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } + if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { + return simde_mm_setzero_si64(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m64_to_private(a); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, cnt); + #else SIMDE_VECTORIZE for (size_t i = 0 ; i < sizeof(r_.u32) / sizeof(r_.u32[0]) ; i++) { - r_.u32[i] = a_.u32[i] >> count_.u64[0]; + 
r_.u32[i] = a_.u32[i] >> cnt; } #endif @@ -1752,21 +1742,21 @@ simde_mm_srl_si64 (simde__m64 a, simde__m64 count) { #if defined(SIMDE_X86_MMX_NATIVE) return _mm_srl_si64(a, count); #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); + simde__m64_private r_, a_; simde__m64_private count_ = simde__m64_to_private(count); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64)); + if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { + return simde_mm_setzero_si64(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m64_to_private(a); + + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt))); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = a_.u64 >> count_.u64; + r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(64, cnt); #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - r_.u64[0] = a_.u64[0] >> count_.u64[0]; + r_.u64[0] = a_.u64[0] >> cnt; #endif return simde__m64_from_private(r_); @@ -1849,12 +1839,12 @@ simde_mm_sra_pi16 (simde__m64 a, simde__m64 count) { simde__m64_private r_; simde__m64_private a_ = simde__m64_to_private(a); simde__m64_private count_ = simde__m64_to_private(count); - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0])); + const int cnt = count_.u64[0] > 15 ? 
15 : HEDLEY_STATIC_CAST(int, count_.u64[0]); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> cnt; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, vget_lane_u64(count_.neon_u64, 0)))); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i16 = a_.i16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -1880,12 +1870,12 @@ simde_mm_sra_pi32 (simde__m64 a, simde__m64 count) { simde__m64_private r_; simde__m64_private a_ = simde__m64_to_private(a); simde__m64_private count_ = simde__m64_to_private(count); - const int32_t cnt = (count_.u64[0] > 31) ? 31 : HEDLEY_STATIC_CAST(int32_t, count_.u64[0]); + const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> cnt; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, vget_lane_u64(count_.neon_u64, 0)))); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { diff --git a/thirdparty/simde/x86/sse.h b/thirdparty/simde/x86/sse.h index 110142ca9..6c3f8272a 100644 --- a/thirdparty/simde/x86/sse.h +++ b/thirdparty/simde/x86/sse.h @@ -974,10 +974,10 @@ simde_mm_and_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = 
a_.i32 & b_.i32; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = a_.i32 & b_.i32; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -1307,7 +1307,7 @@ simde_mm_cmpeq_ps (simde__m128 a, simde__m128 b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? UINT32_MAX : UINT32_C(0); } #endif @@ -1336,7 +1336,7 @@ simde_mm_cmpeq_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32), 0); #else - r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? UINT32_MAX : UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.u32[i] = a_.u32[i]; @@ -1373,7 +1373,7 @@ simde_mm_cmpge_ps (simde__m128 a, simde__m128 b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? UINT32_MAX : UINT32_C(0); } #endif @@ -1402,7 +1402,7 @@ simde_mm_cmpge_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32), 0); #else - r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? 
UINT32_MAX : UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.u32[i] = a_.u32[i]; @@ -1439,7 +1439,7 @@ simde_mm_cmpgt_ps (simde__m128 a, simde__m128 b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? UINT32_MAX : UINT32_C(0); } #endif @@ -1468,7 +1468,7 @@ simde_mm_cmpgt_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32), 0); #else - r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? UINT32_MAX : UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.u32[i] = a_.u32[i]; @@ -1505,7 +1505,7 @@ simde_mm_cmple_ps (simde__m128 a, simde__m128 b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? UINT32_MAX : UINT32_C(0); } #endif @@ -1534,7 +1534,7 @@ simde_mm_cmple_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32), 0); #else - r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? UINT32_MAX : UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.u32[i] = a_.u32[i]; @@ -1572,7 +1572,7 @@ simde_mm_cmplt_ps (simde__m128 a, simde__m128 b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? 
UINT32_MAX : UINT32_C(0); } #endif @@ -1601,7 +1601,7 @@ simde_mm_cmplt_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32), 0); #else - r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? UINT32_MAX : UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.u32[i] = a_.u32[i]; @@ -1640,7 +1640,7 @@ simde_mm_cmpneq_ps (simde__m128 a, simde__m128 b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? UINT32_MAX : UINT32_C(0); } #endif @@ -1669,7 +1669,7 @@ simde_mm_cmpneq_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_f32, __lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32), 0); #else - r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? UINT32_MAX : UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.u32[i] = a_.u32[i]; @@ -1785,7 +1785,7 @@ simde_mm_cmpord_ps (simde__m128 a, simde__m128 b) { #elif defined(simde_math_isnanf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? UINT32_C(0) : ~UINT32_C(0); + r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? 
UINT32_C(0) : UINT32_MAX; } #else HEDLEY_UNREACHABLE(); @@ -1825,11 +1825,12 @@ simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) { vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); + r_.lsx_i32 = HEDLEY_REINTERPRET_CAST(v4i32, __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32)); + // TODO: change when https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123759 is resolved #elif defined(simde_math_isnanf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? UINT32_MAX : UINT32_C(0); } #else HEDLEY_UNREACHABLE(); @@ -1859,7 +1860,7 @@ simde_mm_cmpunord_ss (simde__m128 a, simde__m128 b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vor_v(__lsx_vfcmp_cune_s(a_.lsx_f32, a_.lsx_f32), __lsx_vfcmp_cune_s(b_.lsx_f32, b_.lsx_f32)), 0); #elif defined(simde_math_isnanf) - r_.u32[0] = (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) ? ~UINT32_C(0) : UINT32_C(0); + r_.u32[0] = (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) ? UINT32_MAX : UINT32_C(0); SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { r_.u32[i] = a_.u32[i]; @@ -2687,7 +2688,7 @@ simde_mm_cmpord_ss (simde__m128 a, simde__m128 b) { __m128i temp = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, __lsx_vnor_v(temp, temp), 0); #elif defined(simde_math_isnanf) - r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) || simde_math_isnanf(simde_mm_cvtss_f32(b))) ? 
UINT32_C(0) : ~UINT32_C(0); + r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) || simde_math_isnanf(simde_mm_cvtss_f32(b))) ? UINT32_C(0) : UINT32_MAX; SIMDE_VECTORIZE for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { r_.u32[i] = a_.u32[i]; @@ -3119,6 +3120,15 @@ simde_mm_max_ps (simde__m128 a, simde__m128 b) { r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(a_.altivec_f32, b_.altivec_f32)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) r_.lsx_f32 = __lsx_vfmax_s(a_.lsx_f32, b_.lsx_f32); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + uint32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32 > b_.f32); + r_.f32 = + HEDLEY_REINTERPRET_CAST( + __typeof__(r_.f32), + ( (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32) & m) | + (HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f32) & ~m) + ) + ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -3236,17 +3246,19 @@ simde_mm_min_ps (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(SIMDE_FAST_NANS) && defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_NANS) r_.neon_f32 = vminq_f32(a_.neon_f32, b_.neon_f32); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_f32 = vbslq_f32(vcltq_f32(a_.neon_f32, b_.neon_f32), a_.neon_f32, b_.neon_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS) + r_.wasm_v128 = wasm_f32x4_min(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_pmin(b_.wasm_v128, a_.wasm_v128); + r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128)); + #elif (defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)) && defined(SIMDE_FAST_NANS) + r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32); #elif 
defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - #if defined(SIMDE_FAST_NANS) - r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32); - #else - r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(b_.altivec_f32, a_.altivec_f32)); - #endif - #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmplt(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) r_.lsx_f32 = __lsx_vfmin_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) uint32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32 < b_.f32); @@ -3504,12 +3516,12 @@ simde_mm_mul_ps (simde__m128 a, simde__m128 b) { r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 * b_.f32; #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f32 = vec_mul(a_.altivec_f32, b_.altivec_f32); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_f32 = __lsx_vfmul_s(a_.lsx_f32, b_.lsx_f32); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f32 = a_.f32 * b_.f32; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -4089,9 +4101,16 @@ simde_mm_setzero_ps (void) { #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) return (simde__m128)__lsx_vreplgr2vr_w(0); #else - simde__m128 r; - simde_memset(&r, 0, sizeof(r)); - return r; + simde__m128_private r_; + #if defined(SIMDE_VECTOR_SUBSCRIPT) + r_.f32 = __extension__ (__typeof__(r_.f32)) { 0.0f, 0.0f, 0.0f, 0.0f }; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = 0.0f; + } + #endif + return simde__m128_from_private(r_); #endif } #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) @@ -4103,18 +4122,29 @@ HEDLEY_DIAGNOSTIC_PUSH 
SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ #endif +#if defined(SIMDE_X86_SSE_NATIVE) +# if defined(__has_builtin) +# if __has_builtin(__builtin_ia32_undef128) +# define SIMDE_HAVE_UNDEFINED128 +# endif +# elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && !defined(_MSC_VER) +# define SIMDE_HAVE_UNDEFINED128 +# endif +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_undefined_ps (void) { - simde__m128_private r_; #if defined(SIMDE_HAVE_UNDEFINED128) - r_.n = _mm_undefined_ps(); + return _mm_undefined_ps(); #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m128_to_private(simde_mm_setzero_ps()); + return simde_mm_setzero_ps(); + #else + simde__m128_private r_; + return simde__m128_from_private(r_); #endif - return simde__m128_from_private(r_); } #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) # define _mm_undefined_ps() simde_mm_undefined_ps() @@ -4823,16 +4853,6 @@ simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) { # define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b)) #endif -#if defined(SIMDE_X86_SSE_NATIVE) -# if defined(__has_builtin) -# if __has_builtin(__builtin_ia32_undef128) -# define SIMDE_HAVE_UNDEFINED128 -# endif -# elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && !defined(_MSC_VER) -# define SIMDE_HAVE_UNDEFINED128 -# endif -#endif - SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) { diff --git a/thirdparty/simde/x86/sse2.h b/thirdparty/simde/x86/sse2.h index a2d0faf7c..395d3919f 100644 --- a/thirdparty/simde/x86/sse2.h +++ b/thirdparty/simde/x86/sse2.h @@ -424,6 +424,137 @@ simde__m128d_to_private(simde__m128d v) { SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v2f64, lsx, f64) #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ +SIMDE_FUNCTION_ATTRIBUTES +simde__m128d +simde_x_mm_round_pd (simde__m128d a, int rounding, int lax_rounding) + SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lax_rounding, 0, 1) { + 
simde__m128d_private + r_, + a_ = simde__m128d_to_private(a); + + (void) lax_rounding; + + /* For architectures which lack a current direction SIMD instruction. + * + * Note that NEON actually has a current rounding mode instruction, + * but in ARMv8+ the rounding mode is ignored and nearest is always + * used, so we treat ARMv7 as having a rounding mode but ARMv8 as + * not. */ + #if \ + defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ARM_NEON_A32V8) + if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION) + rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13; + #endif + + switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { + case SIMDE_MM_FROUND_CUR_DIRECTION: + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vrndiq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); + #elif defined(simde_math_nearbyint) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = simde_math_nearbyint(a_.f64[i]); + } + #else + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + #endif + break; + + case SIMDE_MM_FROUND_TO_NEAREST_INT: + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_rint(a_.altivec_f64)); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vrndnq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); + #elif defined(simde_math_roundeven) + SIMDE_VECTORIZE + for (size_t i = 0 ; 
i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = simde_math_roundeven(a_.f64[i]); + } + #else + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + #endif + break; + + case SIMDE_MM_FROUND_TO_NEG_INF: + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64)); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vrndmq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_floor(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrm_d(a_.lsx_f64); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = simde_math_floor(a_.f64[i]); + } + #endif + break; + + case SIMDE_MM_FROUND_TO_POS_INF: + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64)); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vrndpq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_ceil(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrp_d(a_.lsx_f64); + #elif defined(simde_math_ceil) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = simde_math_ceil(a_.f64[i]); + } + #else + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + #endif + break; + + case SIMDE_MM_FROUND_TO_ZERO: + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64)); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vrndq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_trunc(a_.wasm_v128); + #elif 
defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f64 = __lsx_vfrintrz_d(a_.lsx_f64); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = simde_math_trunc(a_.f64[i]); + } + #endif + break; + + default: + HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); + } + + return simde__m128d_from_private(r_); +} +#if defined(SIMDE_X86_SSE4_1_NATIVE) + #define simde_mm_round_pd(a, rounding) _mm_round_pd((a), (rounding)) +#else + #define simde_mm_round_pd(a, rounding) simde_x_mm_round_pd((a), (rounding), 0) +#endif +#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) + #define _mm_round_pd(a, rounding) simde_mm_round_pd((a), (rounding)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_pd (simde_float64 e1, simde_float64 e0) { @@ -437,7 +568,7 @@ simde_mm_set_pd (simde_float64 e1, simde_float64 e0) { #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 }; r_.neon_f64 = vld1q_f64(data); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 }; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -571,12 +702,12 @@ simde_x_mm_select_pd(simde__m128d a, simde__m128d b, simde__m128d mask) { b_ = simde__m128d_to_private(b), mask_ = simde__m128d_to_private(mask); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask_.lsx_u64) + r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, (__m128i)mask_.lsx_u64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { @@ -1171,14 +1302,14 @@ 
simde_mm_xor_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) + #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vxor_v(a_.lsx_i64, b_.lsx_i64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = a_.i32f ^ b_.i32f; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { @@ -1344,7 +1475,7 @@ simde_mm_bslli_si128 (simde__m128i a, const int imm8) #define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8) #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) #define simde_mm_bslli_si128(a, imm8) \ - (((imm8)<=0) ? (a) : (((imm8)>15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i8((v16i8)__lsx_vbsll_v(simde__m128i_to_private(a).lsx_i64, (imm8))))) + (((imm8)<=0) ? (simde__m128i)(a) : (((imm8)>15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i8((v16i8)__lsx_vbsll_v(simde__m128i_to_private(a).lsx_i64, (imm8))))) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) #define simde_mm_bslli_si128(a, imm8) \ simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8))))) @@ -1442,7 +1573,7 @@ simde_mm_bsrli_si128 (simde__m128i a, const int imm8) #define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8) #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) #define simde_mm_bsrli_si128(a, imm8) \ - (((imm8)<=0) ? (a) : (((imm8)>15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i8((v16i8)__lsx_vbsrl_v(simde__m128i_to_private(a).lsx_i64, (imm8))))) + (((imm8)<=0) ? (simde__m128i)(a) : (((imm8)>15) ? 
simde_mm_setzero_si128() : simde__m128i_from_lsx_i8((v16i8)__lsx_vbsrl_v(simde__m128i_to_private(a).lsx_i64, (imm8))))) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) #define simde_mm_bsrli_si128(a, imm8) \ simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? imm8 : (imm8 & 15)))) @@ -1867,6 +1998,68 @@ simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) { #define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpneq_epi8 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_ne(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 != b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (a_.i8[i] != b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpeq_epu8 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), (a_.u8 == b_.u8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { + r_.u8[i] = (a_.u8[i] == b_.u8[i]) ? 
UINT8_MAX : UINT8_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpneq_epu8 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), (a_.u8 != b_.u8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { + r_.u8[i] = (a_.u8[i] != b_.u8[i]) ? UINT8_MAX : UINT8_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) { @@ -1902,6 +2095,67 @@ simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) { #define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpneq_epi16 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_ne(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = (a_.i16 != b_.i16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = (a_.i16[i] != b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpeq_epu16 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 == b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] == b_.u16[i]) ? 
UINT16_MAX : UINT16_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpneq_epu16 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 != b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] != b_.u16[i]) ? UINT16_MAX : UINT16_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) { @@ -1937,6 +2191,71 @@ simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) { #define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_mm_x_cmpneq_epi32 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_ne(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32)); + r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_nor(r_.altivec_i32, r_.altivec_i32)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 != b_.i32); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] != b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpeq_epu32 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 == b_.u32); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] == b_.u32[i]) ? UINT32_MAX : UINT32_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpneq_epu32 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 != b_.u32); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] != b_.u32[i]) ? UINT32_MAX : UINT32_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) { @@ -1963,7 +2282,7 @@ simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? UINT64_MAX : UINT64_C(0); } #endif @@ -1992,7 +2311,7 @@ simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_ceq_d(a_.lsx_f64, b_.lsx_f64), 0); #else - r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0; + r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? 
UINT64_MAX : UINT64_C(0); r_.u64[1] = a_.u64[1]; #endif return simde__m128d_from_private(r_); @@ -2024,7 +2343,7 @@ simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? UINT64_MAX : UINT64_C(0); } #endif @@ -2052,7 +2371,7 @@ simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_cune_d(a_.lsx_f64, b_.lsx_f64), 0); #else - r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? UINT64_MAX : UINT64_C(0); r_.u64[1] = a_.u64[1]; #endif return simde__m128d_from_private(r_); @@ -2191,7 +2510,7 @@ simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? UINT64_MAX : UINT64_C(0); } #endif @@ -2220,7 +2539,7 @@ simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_clt_d(a_.lsx_f64, b_.lsx_f64), 0); #else - r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? 
UINT64_MAX : UINT64_C(0); r_.u64[1] = a_.u64[1]; #endif return simde__m128d_from_private(r_); @@ -2241,9 +2560,7 @@ simde_mm_cmple_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_u64 = vcleq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128); @@ -2251,10 +2568,12 @@ simde_mm_cmple_pd (simde__m128d a, simde__m128d b) { r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? UINT64_MAX : UINT64_C(0); } #endif @@ -2282,7 +2601,7 @@ simde_mm_cmple_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_cle_d(a_.lsx_f64, b_.lsx_f64), 0); #else - r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? 
UINT64_MAX : UINT64_C(0); r_.u64[1] = a_.u64[1]; #endif return simde__m128d_from_private(r_); @@ -2408,9 +2727,7 @@ simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_u64 = vcgtq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128); @@ -2418,10 +2735,12 @@ simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) { r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? UINT64_MAX : UINT64_C(0); } #endif @@ -2449,7 +2768,7 @@ simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_clt_d(b_.lsx_f64, a_.lsx_f64), 0); #else - r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? 
UINT64_MAX : UINT64_C(0); r_.u64[1] = a_.u64[1]; #endif return simde__m128d_from_private(r_); @@ -2470,9 +2789,7 @@ simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_u64 = vcgeq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128); @@ -2480,10 +2797,12 @@ simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) { r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vfcmp_cle_d(b_.lsx_f64, a_.lsx_f64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? UINT64_MAX : UINT64_C(0); } #endif @@ -2512,7 +2831,7 @@ simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, (__m128i)__lsx_vfcmp_cle_d(b_.lsx_f64, a_.lsx_f64), 0); #else - r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? UINT64_MAX : UINT64_C(0); r_.u64[1] = a_.u64[1]; #endif return simde__m128d_from_private(r_); @@ -2653,7 +2972,7 @@ simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) { #elif defined(simde_math_isnan) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? 
~UINT64_C(0) : UINT64_C(0); + r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? UINT64_MAX : UINT64_C(0); } #else HEDLEY_UNREACHABLE(); @@ -2708,7 +3027,7 @@ simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) { r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vand_v(__lsx_vfcmp_ceq_d(a_.lsx_f64, a_.lsx_f64), __lsx_vfcmp_ceq_d(b_.lsx_f64, b_.lsx_f64)), 0); #elif defined(simde_math_isnan) - r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? UINT64_MAX : UINT64_C(0); r_.u64[1] = a_.u64[1]; #else HEDLEY_UNREACHABLE(); @@ -2745,7 +3064,7 @@ simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) { #elif defined(simde_math_isnan) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? UINT64_MAX : UINT64_C(0); } #else HEDLEY_UNREACHABLE(); @@ -2776,7 +3095,7 @@ simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vextrins_d(a_.lsx_i64, __lsx_vor_v(__lsx_vfcmp_cune_d(a_.lsx_f64, a_.lsx_f64), __lsx_vfcmp_cune_d(b_.lsx_f64, b_.lsx_f64)), 0); #elif defined(simde_math_isnan) - r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? 
UINT64_MAX : UINT64_C(0); r_.u64[1] = a_.u64[1]; #else HEDLEY_UNREACHABLE(); @@ -2863,8 +3182,9 @@ simde_mm_cvtpd_pi32 (simde__m128d a) { return _mm_cvtpd_pi32(a); #else simde__m64_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); + simde__m128d_private a_; + a_ = simde__m128d_to_private(simde_x_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1)); SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { simde_float64 v = simde_math_round(a_.f64[i]); @@ -2993,7 +3313,7 @@ simde_mm_cvtps_epi32 (simde__m128 a) { r_.wasm_v128 = wasm_i32x4_trunc_sat_f32x4(a_.wasm_v128); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) a_ = simde__m128_to_private(a); - r_.lsx_i32 = __lsx_vftintrne_w_s(a_.lsx_f32); + r_.lsx_i32 = HEDLEY_REINTERPRET_CAST(v4i32, __lsx_vftintrne_w_s(a_.lsx_f32)); #else a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1)); SIMDE_VECTORIZE @@ -3413,7 +3733,7 @@ simde_mm_cvttpd_epi32 (simde__m128d a) { #if defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) const v2f64 zero_f64 = {-0.0f, -0.0f}; - r_.lsx_i64 = __lsx_vftintrz_w_d(zero_i64, simde__m128d_to_private(a).lsx_f64); + r_.lsx_i64 = __lsx_vftintrz_w_d(zero_f64, simde__m128d_to_private(a).lsx_f64); #else r_.m64[0] = simde_mm_cvttpd_pi32(a); r_.m64[1] = simde_mm_setzero_si64(); @@ -3473,7 +3793,7 @@ simde_mm_cvttps_epi32 (simde__m128 a) { r_.wasm_v128 = wasm_v128_bitselect(r_.wasm_v128, wasm_i32x4_splat(INT32_MIN), valid_input); #endif #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __m128i temp = __lsx_vftintrz_w_s(a_.lsx_f32); + r_.lsx_i64 = __lsx_vftintrz_w_s(a_.lsx_f32); #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) #if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS) simde_float32 f1 = 2147483648.0f; @@ -3489,7 +3809,7 @@ simde_mm_cvttps_epi32 (simde__m128 a) { __m128i valid_input = 
__lsx_vfcmp_ceq_s(a_.lsx_f32, a_.lsx_f32); #endif - r_.lsx_i64 = __lsx_vbitsel_v(__lsx_vreplgr2vr_w(INT32_MIN), temp, valid_input); + r_.lsx_i64 = __lsx_vbitsel_v(__lsx_vreplgr2vr_w(INT32_MIN), r_.lsx_i64, valid_input); #endif #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_ARCH_POWER) SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); @@ -3586,14 +3906,12 @@ simde_mm_div_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 / b_.f64; - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfdiv_d(b_.lsx_f64, a_.lsx_f64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f64 = a_.f64 / b_.f64; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -3793,6 +4111,11 @@ simde_mm_load_si128 (simde__m128i const* mem_addr) { #define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) { @@ -3824,6 +4147,10 @@ simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) { #define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_POP +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadl_epi64 (simde__m128i const* mem_addr) { @@ -3964,6 +4291,16 @@ simde_mm_loadu_epi8(void const * mem_addr) { #define _mm_loadu_epi8(a) simde_mm_loadu_epi8(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_loadu_epu8(void const * mem_addr) { + 
simde__m128i_private r_; + + simde_memcpy(&r_, mem_addr, sizeof(r_)); + + return simde__m128i_from_private(r_); +} + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) @@ -3995,6 +4332,14 @@ simde_mm_loadu_epi16(void const * mem_addr) { #define _mm_loadu_epi16(a) simde_mm_loadu_epi16(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_loadu_epu16(void const * mem_addr) { + simde__m128i_private r_; + simde_memcpy(&r_, mem_addr, sizeof(r_)); + return simde__m128i_from_private(r_); +} + #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ && !defined(SIMDE_BUG_CLANG_REV_344862) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) #define simde_mm_loadu_epi32(mem_addr) _mm_loadu_epi32(mem_addr) @@ -4025,6 +4370,14 @@ simde_mm_loadu_epi32(void const * mem_addr) { #define _mm_loadu_epi32(a) simde_mm_loadu_epi32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_loadu_epu32(void const * mem_addr) { + simde__m128i_private r_; + simde_memcpy(&r_, mem_addr, sizeof(r_)); + return simde__m128i_from_private(r_); +} + #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ && !defined(SIMDE_BUG_CLANG_REV_344862) \ && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) @@ -4056,6 +4409,14 @@ simde_mm_loadu_epi64(void const * mem_addr) { #define _mm_loadu_epi64(a) simde_mm_loadu_epi64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_loadu_epu64(void const * mem_addr) { + simde__m128i_private r_; + simde_memcpy(&r_, mem_addr, sizeof(r_)); + return simde__m128i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_si128 (void const* mem_addr) { @@ -4114,6 +4475,9 @@ simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = vec_mule(a_.altivec_i16, 
b_.altivec_i16) + vec_mulo(a_.altivec_i16, b_.altivec_i16); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_dot_i16x8(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __m128i temp_ev = __lsx_vmulwev_w_h(a_.lsx_i64, b_.lsx_i64); + r_.lsx_i64 = __lsx_vmaddwod_w_h(temp_ev, a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) int32_t SIMDE_VECTOR(32) a32, b32, p32; SIMDE_CONVERT_VECTOR_(a32, a_.i16); @@ -4122,9 +4486,6 @@ simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) { r_.i32 = __builtin_shufflevector(p32, p32, 0, 2, 4, 6) + __builtin_shufflevector(p32, p32, 1, 3, 5, 7); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __m128i temp_ev = __lsx_vmulwev_w_h(a_.lsx_i64, b_.lsx_i64); - r_.lsx_i64 = __lsx_vmaddwod_w_h(temp_ev, a_.lsx_i64, b_.lsx_i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) { @@ -4393,14 +4754,29 @@ simde_mm_min_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_NANS) r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vbslq_f64(vcltq_f64(a_.neon_f64, b_.neon_f64), a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS) r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128)); + #elif (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || 
defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) && defined(SIMDE_FAST_NANS) + r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64); + #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = vec_sel(b_.altivec_f64, a_.altivec_f64, vec_cmplt(a_.altivec_f64, b_.altivec_f64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) r_.lsx_f64 = __lsx_vfmin_d(a_.lsx_f64, b_.lsx_f64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + uint64_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64 < b_.f64); + r_.f64 = + HEDLEY_REINTERPRET_CAST( + __typeof__(r_.f64), + ( (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64) & m) | + (HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f64) & ~m) + ) + ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -4524,14 +4900,29 @@ simde_mm_max_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_NANS) r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_f64 = vbslq_f64(vcgtq_f64(a_.neon_f64, b_.neon_f64), a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS) + r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128)); + #elif (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) && defined(SIMDE_FAST_NANS) + 
r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64); + #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_f64 = vec_sel(b_.altivec_f64, a_.altivec_f64, vec_cmpgt(a_.altivec_f64, b_.altivec_f64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) r_.lsx_f64 = __lsx_vfmax_d(a_.lsx_f64, b_.lsx_f64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + uint64_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64 > b_.f64); + r_.f64 = + HEDLEY_REINTERPRET_CAST( + __typeof__(r_.f64), + ( (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64) & m) | + (HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f64) & ~m) + ) + ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -4704,14 +5095,14 @@ simde_mm_mul_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 * b_.f64; - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_f64 = __lsx_vfmul_d(a_.lsx_f64, b_.lsx_f64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f64 = a_.f64 * b_.f64; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -4918,14 +5309,14 @@ simde_mm_or_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) + #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = 
__lsx_vor_v(a_.lsx_i64, b_.lsx_i64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32f = a_.i32f | b_.i32f; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { @@ -5144,7 +5535,11 @@ simde_mm_pause (void) { __asm__ __volatile__("isb\n"); #endif #elif defined(SIMDE_ARCH_POWER) - __asm__ __volatile__ ("or 27,27,27" ::: "memory"); + #if defined(__APPLE__) + __asm__ __volatile__ ("or r27,r27,r27" ::: "memory"); + #else + __asm__ __volatile__ ("or 27,27,27" ::: "memory"); + #endif #elif defined(SIMDE_ARCH_WASM) __asm__ __volatile__ ("nop"); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) @@ -5310,8 +5705,6 @@ simde_mm_loadu_si16 (void const* mem_addr) { HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ HEDLEY_GCC_VERSION_CHECK(12,1,0)) return _mm_loadu_si16(mem_addr); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) - return __lsx_vld(mem_addr, 0); #else int16_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -5366,8 +5759,10 @@ simde_mm_loadu_si32 (void const* mem_addr) { simde__m128i_private r_; r_.neon_i32 = vsetq_lane_s32(* HEDLEY_REINTERPRET_CAST(const int32_t *, mem_addr), vdupq_n_s32(0), 0); return simde__m128i_from_private(r_); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) - return __lsx_vld(mem_addr, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i_private r_; + r_.lsx_i64 = __lsx_vbsrl_v(__lsx_vldrepl_w(mem_addr, 0), 12); + return simde__m128i_from_private(r_); #else int32_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -5388,7 +5783,7 @@ simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1)); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_TO_16 simde__m64 data[2] = {e0, e1}; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5416,7 +5811,7 @@ simde_mm_set_epi64x (int64_t e1, int64_t e0) { r_.neon_i64 = vld1q_s64(data); #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i64x2_make(e0, e1); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_LIKE_16(v2i64) int64_t data[2] = {e0, e1}; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5439,10 +5834,12 @@ simde_mm_loadu_si64 (void const* mem_addr) { HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(20,21,1)) return _mm_loadu_si64(mem_addr); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) - return __lsx_vld(mem_addr, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + simde__m128i_private r_; + r_.lsx_i64 = __lsx_vbsrl_v(__lsx_vldrepl_d(mem_addr, 0), 8); + return simde__m128i_from_private(r_); #else - int64_t val; + int64_t val; simde_memcpy(&val, mem_addr, sizeof(val)); return simde_mm_cvtsi64_si128(val); #endif @@ -5475,7 +5872,7 @@ simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, r_.neon_u8 = vld1q_u8(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_LIKE_16(v16u8) uint8_t data[16] = { e0, e1, e2, e3, e4, e5, e6, e7, @@ -5509,7 +5906,7 @@ simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, r_.neon_u16 = vld1q_u16(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_LIKE_16(v8u16) uint16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5535,7 +5932,7 @@ simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { r_.neon_u32 = vld1q_u32(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u32x4_make(e0, e1, e2, e3); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) 
SIMDE_ALIGN_LIKE_16(v4u32) uint32_t data[4] = {e0, e1, e2, e3}; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5562,7 +5959,7 @@ simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) { r_.neon_u64 = vld1q_u64(data); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_u64x2_make(e0, e1); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) SIMDE_ALIGN_LIKE_16(v2u64) uint64_t data[2] = {e0, e1}; r_.lsx_i64 = __lsx_vld(data, 0); #else @@ -5583,7 +5980,7 @@ simde_mm_set_sd (simde_float64 a) { return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0); #elif defined(SIMDE_WASM_SIMD128_NATIVE) return simde__m128d_from_wasm_v128(wasm_f64x2_make(a, 0)); - #elif defined(SIMD_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) return (__m128d)__lsx_vinsgr2vr_d(__lsx_vldrepl_d(&a, 0), 0, 1); #else return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a); @@ -6152,24 +6549,27 @@ simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) { #else simde__m128i_private r_, - a_ = simde__m128i_to_private(a), + a_, count_ = simde__m128i_to_private(count); - if (count_.u64[0] > 15) + if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { return simde_mm_setzero_si128(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = (a_.u16 << count_.u64[0]); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0]))); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vslli_h(a_.lsx_i64, count_.u64[0]); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? 
wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0)); + r_.wasm_v128 = wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsll_h(a_.lsx_i64, __lsx_vreplgr2vr_h(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u16 = a_.u16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(16, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0])); + r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << cnt)); } #endif @@ -6188,24 +6588,27 @@ simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) { #else simde__m128i_private r_, - a_ = simde__m128i_to_private(a), + a_, count_ = simde__m128i_to_private(count); - if (count_.u64[0] > 31) + if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { return simde_mm_setzero_si128(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = (a_.u32 << count_.u64[0]); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0]))); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vslli_w(a_.lsx_i64, count_.u64[0]); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? 
wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0)); + r_.wasm_v128 = wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsll_w(a_.lsx_i64, __lsx_vreplgr2vr_w(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u32 = a_.u32 << SIMDE_CAST_VECTOR_SHIFT_COUNT(32, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0])); + r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << cnt)); } #endif @@ -6224,25 +6627,27 @@ simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) { #else simde__m128i_private r_, - a_ = simde__m128i_to_private(a), + a_, count_ = simde__m128i_to_private(count); - if (count_.u64[0] > 63) + if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { return simde_mm_setzero_si128(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m128i_to_private(a); - const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s))); + r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = (s < 64) ? 
wasm_i64x2_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, s)) : wasm_i64x2_const(0,0); + r_.wasm_v128 = wasm_i64x2_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vsll_d(a_.lsx_i64, __lsx_vreplgr2vr_d(HEDLEY_STATIC_CAST(int64_t, s))); + r_.lsx_i64 = __lsx_vsll_d(a_.lsx_i64, __lsx_vreplgr2vr_d(cnt)); #else #if !defined(SIMDE_BUG_GCC_94488) SIMDE_VECTORIZE #endif for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] << s; + r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, (a_.u64[i] << cnt)); } #endif @@ -6326,15 +6731,23 @@ simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) { #else simde__m128i_private r_, - a_ = simde__m128i_to_private(a), + a_, count_ = simde__m128i_to_private(count); - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0])); + if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { + return simde_mm_setzero_si128(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m128i_to_private(a); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vsrl_h(a_.lsx_i64, __lsx_vreplgr2vr_h(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -6357,10 +6770,14 @@ simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) { #else simde__m128i_private r_, - a_ = simde__m128i_to_private(a), + a_, count_ = simde__m128i_to_private(count); - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 
32 : count_.i64[0])); + if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { + return simde_mm_setzero_si128(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m128i_to_private(a); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); @@ -6368,6 +6785,8 @@ simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) { r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vsrl_w(a_.lsx_i64, __lsx_vreplgr2vr_w(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -6390,10 +6809,14 @@ simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) { #else simde__m128i_private r_, - a_ = simde__m128i_to_private(a), + a_, count_ = simde__m128i_to_private(count); - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 
64 : count_.i64[0])); + if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { + return simde_mm_setzero_si128(); + } + const int cnt = HEDLEY_STATIC_CAST(int, count_.u64[0]); + a_ = simde__m128i_to_private(a); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt))); @@ -6433,7 +6856,7 @@ simde_mm_srai_epi16 (simde__m128i a, const int imm8) #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vsra_h(a_.lsx_i64, __lsx_vreplgr2vr_h(cnt)); + r_.lsx_i64 = (simde__m128i)((v8i16)a_.lsx_i64 >> cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) { @@ -6466,7 +6889,7 @@ simde_mm_srai_epi32 (simde__m128i a, const int imm8) #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vsra_w(a_.lsx_i64, __lsx_vreplgr2vr_w(cnt)); + r_.lsx_i64 = (simde__m128i)((v4i32)a_.lsx_i64 >> cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) { @@ -6494,14 +6917,16 @@ simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) { a_ = simde__m128i_to_private(a), count_ = simde__m128i_to_private(count); - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0])); + const int cnt = count_.u64[0] > 15 ? 
15 : HEDLEY_STATIC_CAST(int, count_.u64[0]); #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vsra_h(a_.lsx_i64, __lsx_vreplgr2vr_h(cnt)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vsra_h(a_.lsx_i64, __lsx_vreplgr2vr_h(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i16 = a_.i16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -6535,6 +6960,8 @@ simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) { r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vsra_w(a_.lsx_i64, __lsx_vreplgr2vr_w(cnt)); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, cnt); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -6561,7 +6988,9 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8) r_, a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v8i16)a_.lsx_i64 << imm8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff); #else const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8; @@ -6589,8 +7018,6 @@ simde_mm_slli_epi16 (simde__m128i a, const int imm8) #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) #define simde_mm_slli_epi16(a, imm8) \ ((imm8 & ~15) ? 
simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8))))) -#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - #define simde_mm_slli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8) @@ -6607,7 +7034,9 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8) r_, a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v4i32)a_.lsx_i64 << imm8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i32 = a_.i32 << imm8; #else SIMDE_VECTORIZE @@ -6646,8 +7075,6 @@ simde_mm_slli_epi32 (simde__m128i a, const int imm8) } \ ret; \ })) -#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - #define simde_mm_slli_epi32(a, imm8) ((imm8 & ~31) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8) @@ -6664,7 +7091,9 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8) r_, a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v2i64)a_.lsx_i64 << imm8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i64 = a_.i64 << imm8; #else SIMDE_VECTORIZE @@ -6688,8 +7117,6 @@ simde_mm_slli_epi64 (simde__m128i a, const int imm8) #elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_slli_epi64(a, imm8) \ ((imm8 < 64) ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0)) -#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - #define simde_mm_slli_epi64(a, imm8) ((imm8 & ~63) ? 
simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vslli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8) @@ -6706,7 +7133,9 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8) r_, a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v8u16)a_.lsx_i64 >> imm8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8); #else SIMDE_VECTORIZE @@ -6733,8 +7162,6 @@ simde_mm_srli_epi16 (simde__m128i a, const int imm8) #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) #define simde_mm_srli_epi16(a, imm8) \ ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8))))) -#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - #define simde_mm_srli_epi16(a, imm8) ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_h(simde__m128i_to_private(a).lsx_i64, ((imm8) & 15)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8) @@ -6751,7 +7178,9 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8) r_, a_ = simde__m128i_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + #if defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v4u32)a_.lsx_i64 >> imm8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff); #else SIMDE_VECTORIZE @@ -6790,8 +7219,6 @@ simde_mm_srli_epi32 (simde__m128i a, const int imm8) } \ ret; \ })) -#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - #define simde_mm_srli_epi32(a, imm8) ((imm8 & ~31) ? 
simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_w(simde__m128i_to_private(a).lsx_i64, ((imm8) & 31)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8) @@ -6810,6 +7237,8 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8) #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = (simde__m128i)((v2u64)a_.lsx_i64 >> imm8); #else #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488) r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8); @@ -6836,8 +7265,6 @@ simde_mm_srli_epi64 (simde__m128i a, const int imm8) #elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_srli_epi64(a, imm8) \ ((imm8 < 64) ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0)) -#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - #define simde_mm_srli_epi64(a, imm8) ((imm8 & ~63) ? 
simde_mm_setzero_si128() : simde__m128i_from_lsx_i64(__lsx_vsrli_d(simde__m128i_to_private(a).lsx_i64, ((imm8) & 63)))) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8) @@ -6852,7 +7279,7 @@ simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) __lsx_vst(simde__m128d_to_private(a).lsx_i64, mem_addr, 0); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a)); @@ -7051,7 +7478,7 @@ simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) { _mm_storeu_pd(mem_addr, a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) __lsx_vst(simde__m128d_to_private(a).lsx_f64, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); @@ -7066,7 +7493,7 @@ void simde_mm_storeu_si128 (void* mem_addr, simde__m128i a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_BUG_GCC_123766) __lsx_vst(simde__m128i_to_private(a).lsx_i64, mem_addr, 0); #else simde_memcpy(mem_addr, &a, sizeof(a)); @@ -7351,12 +7778,12 @@ simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) { a_ = simde__m128i_to_private(a), b_ = simde__m128i_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 - b_.u32; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vsubq_u32(a_.neon_u32, 
b_.neon_u32); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vsub_w(a_.lsx_i64, b_.lsx_i64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = a_.u32 - b_.u32; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -7378,14 +7805,14 @@ simde_mm_sub_pd (simde__m128d a, simde__m128d b) { a_ = simde__m128d_to_private(a), b_ = simde__m128d_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 - b_.f64; - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_f64 = __lsx_vfsub_d(a_.lsx_f64, b_.lsx_f64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.f64 = a_.f64 - b_.f64; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -7438,10 +7865,10 @@ simde_mm_sub_si64 (simde__m64 a, simde__m64 b) { a_ = simde__m64_to_private(a), b_ = simde__m64_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 - b_.i64; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vsub_s64(a_.neon_i64, b_.neon_i64); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = a_.i64 - b_.i64; #else r_.i64[0] = a_.i64[0] - b_.i64[0]; #endif diff --git a/thirdparty/simde/x86/sse3.h b/thirdparty/simde/x86/sse3.h index 4f83a5105..e46a655cf 100644 --- a/thirdparty/simde/x86/sse3.h +++ b/thirdparty/simde/x86/sse3.h @@ -27,6 +27,13 @@ #if !defined(SIMDE_X86_SSE3_H) #define SIMDE_X86_SSE3_H +#include + +#include "../hedley.h" +#include "../simde-diagnostic.h" +#include "../simde-features.h" +#include "../simde-common.h" +#include "sse.h" #include "sse2.h" HEDLEY_DIAGNOSTIC_PUSH @@ -434,7 +441,14 @@ simde_mm_loaddup_pd (simde_float64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) 
r_.neon_f64 = vdupq_n_f64(*mem_addr); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if HEDLEY_HAS_WARNING("-Wundefined-reinterpret-cast") && SIMDE_DETECT_CLANG_VERSION_CHECK(21, 0, 0) + HEDLEY_DIAGNOSTIC_PUSH + _Pragma("clang diagnostic ignored \"-Wundefined-reinterpret-cast\"") + #endif r_.neon_i64 = vdupq_n_s64(*HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); + #if HEDLEY_HAS_WARNING("-Wundefined-reinterpret-cast") && SIMDE_DETECT_CLANG_VERSION_CHECK(21, 0, 0) + HEDLEY_DIAGNOSTIC_POP + #endif #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vldrepl_d(mem_addr, 0); #else @@ -479,6 +493,11 @@ simde_mm_movedup_pd (simde__m128d a) { # define _mm_movedup_pd(a) simde_mm_movedup_pd(a) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_movehdup_ps (simde__m128 a) { @@ -543,6 +562,10 @@ simde_mm_moveldup_ps (simde__m128 a) { # define _mm_moveldup_ps(a) simde_mm_moveldup_ps(a) #endif +#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) +HEDLEY_DIAGNOSTIC_POP +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/thirdparty/simde/x86/sse4.1.h b/thirdparty/simde/x86/sse4.1.h index ba2bf1869..46fb1ef30 100644 --- a/thirdparty/simde/x86/sse4.1.h +++ b/thirdparty/simde/x86/sse4.1.h @@ -24,10 +24,20 @@ * 2017-2020 Evan Nemerson */ -#include "sse.h" #if !defined(SIMDE_X86_SSE4_1_H) #define SIMDE_X86_SSE4_1_H +#include +#include + +#include "../hedley.h" +#include "../simde-arch.h" +#include "../simde-diagnostic.h" +#include "../simde-features.h" +#include "../simde-math.h" +#include "../simde-common.h" +#include "sse.h" +#include "sse2.h" #include "ssse3.h" HEDLEY_DIAGNOSTIC_PUSH @@ -416,126 +426,6 @@ simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) { #define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask) #endif -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_round_pd (simde__m128d a, int 
rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - /* For architectures which lack a current direction SIMD instruction. */ - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION) - rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13; - #endif - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_CUR_DIRECTION: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndiq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); - #elif defined(simde_math_nearbyint) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_nearbyint(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEAREST_INT: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndaq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfrintrne_d(a_.lsx_f64); - #elif defined(simde_math_roundeven) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_roundeven(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEG_INF: - 
#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndmq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_floor(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfrintrm_d(a_.lsx_f64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_floor(a_.f64[i]); - } - #endif - break; - - case SIMDE_MM_FROUND_TO_POS_INF: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndpq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_ceil(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfrintrp_d(a_.lsx_f64); - #elif defined(simde_math_ceil) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_ceil(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_ZERO: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_trunc(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f64 = __lsx_vfrintrz_d(a_.lsx_f64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = 
simde_math_trunc(a_.f64[i]); - } - #endif - break; - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_round_pd - #define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding) -#endif - SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_ceil_pd (simde__m128d a) { @@ -638,16 +528,18 @@ simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) { uint32x4_t cmp = vceqq_u32(a_.neon_u32, b_.neon_u32); uint32x4_t swapped = vrev64q_u32(cmp); r_.neon_u32 = vandq_u32(cmp, swapped); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 == b_.i64); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpeq(a_.altivec_i64, b_.altivec_i64)); #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) r_.lsx_i64 = __lsx_vseq_d(a_.lsx_i64, b_.lsx_i64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_eq(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 == b_.i64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? 
UINT64_MAX : UINT64_C(0); } #endif @@ -659,6 +551,68 @@ simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) { #define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpneq_epi64 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_ne(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 != b_.i64); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] != b_.u64[i]) ? UINT64_MAX : UINT64_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpeq_epu64 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 == b_.u64); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? UINT64_MAX : UINT64_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i +simde_x_mm_cmpneq_epu64 (simde__m128i a, simde__m128i b) { + simde__m128i_private + r_, + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 != b_.u64); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] != b_.u64[i]) ? 
UINT64_MAX : UINT64_C(0); + } + #endif + + return simde__m128i_from_private(r_); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtepi8_epi16 (simde__m128i a) { diff --git a/thirdparty/simde/x86/sse4.2.h b/thirdparty/simde/x86/sse4.2.h index a0723952c..1435e2a8d 100644 --- a/thirdparty/simde/x86/sse4.2.h +++ b/thirdparty/simde/x86/sse4.2.h @@ -94,6 +94,162 @@ SIMDE_BEGIN_DECLS_ #define _SIDD_UNIT_MASK SIMDE_SIDD_UNIT_MASK #endif +#if 0 // not yet implemented + +SIMDE_FUNCTION_ATTRIBUTES +int simde_mm_cmpestra (simde__m128i a, int la, simde__m128i b, int lb, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private b_ = simde__m128i_to_private(b); + return 0; +} +#if defined(SIMDE_X86_SSE4_2_NATIVE) + #define simde_mm_cmpestra(a, la, b, lb, imm8) _mm_cmpestra(a, la, b, lb, imm8) +#endif +#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpestra + #define _mm_cmpestra(a, la, b, lb, imm8) simde_mm_cmpestra(a, la, b, lb, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int simde_mm_cmpestrc (simde__m128i a, int la, simde__m128i b, int lb, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private b_ = simde__m128i_to_private(b); + return 0; +} +#if defined(SIMDE_X86_SSE4_2_NATIVE) + #define simde_mm_cmpestrc(a, la, b, lb, imm8) _mm_cmpestrc(a, la, b, lb, imm8) +#endif +#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpestrc + #define _mm_cmpestrc(a, la, b, lb, imm8) simde_mm_cmpestrc(a, la, b, lb, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int simde_mm_cmpestri (simde__m128i a, int la, simde__m128i b, int lb, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private b_ = simde__m128i_to_private(b); + return 0; +} +#if defined(SIMDE_X86_SSE4_2_NATIVE) + #define simde_mm_cmpestri(a, la, b, 
lb, imm8) _mm_cmpestri(a, la, b, lb, imm8) +#endif +#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpestri + #define _mm_cmpestri(a, la, b, lb, imm8) simde_mm_cmpestri(a, la, b, lb, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmpestrm (simde__m128i a, int la, simde__m128i b, int lb, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private b_ = simde__m128i_to_private(b); + simde__m128i_private result_ = simde__m128i_to_private(simde_mm_setzero_si128()); + return simde__m128i_from_private(result_); +} +#if defined(SIMDE_X86_SSE4_2_NATIVE) + #define simde_mm_cmpestrm(a, la, b, lb, imm8) _mm_cmpestrm(a, la, b, lb, imm8) +#endif +#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpestrm + #define _mm_cmpestrm(a, la, b, lb, imm8) simde_mm_cmpestrm(a, la, b, lb, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int simde_mm_cmpestro (simde__m128i a, int la, simde__m128i b, int lb, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private b_ = simde__m128i_to_private(b); + return 0; +} +#if defined(SIMDE_X86_SSE4_2_NATIVE) + #define simde_mm_cmpestro(a, la, b, lb, imm8) _mm_cmpestro(a, la, b, lb, imm8) +#endif +#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpestro + #define _mm_cmpestro(a, la, b, lb, imm8) simde_mm_cmpestro(a, la, b, lb, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int simde_mm_cmpistra (simde__m128i a, simde__m128i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private b_ = simde__m128i_to_private(b); + return 0; +} +#if defined(SIMDE_X86_SSE4_2_NATIVE) + #define simde_mm_cmpistra(a, b, imm8) _mm_cmpistra(a, b, imm8) +#endif +#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpistra + #define _mm_cmpistra(a, b, 
imm8) simde_mm_cmpistra(a, b, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int simde_mm_cmpistrc (simde__m128i a, simde__m128i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private b_ = simde__m128i_to_private(b); + return 0; +} +#if defined(SIMDE_X86_SSE4_2_NATIVE) + #define simde_mm_cmpistrc(a, b, imm8) _mm_cmpistrc(a, b, imm8) +#endif +#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpistrc + #define _mm_cmpistrc(a, b, imm8) simde_mm_cmpistrc(a, b, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int simde_mm_cmpistri (simde__m128i a, simde__m128i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private b_ = simde__m128i_to_private(b); + return 0; +} +#if defined(SIMDE_X86_SSE4_2_NATIVE) + #define simde_mm_cmpistri(a, b, imm8) _mm_cmpistri(a, b, imm8) +#endif +#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpistri + #define _mm_cmpistri(a, b, imm8) simde_mm_cmpistri(a, b, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_cmpistrm (simde__m128i a, simde__m128i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private b_ = simde__m128i_to_private(b); + simde__m128i_private result_ = simde__m128i_to_private(simde_mm_setzero_si128()); + return simde__m128i_from_private(result_); +} +#if defined(SIMDE_X86_SSE4_2_NATIVE) + #define simde_mm_cmpistrm(a, b, imm8) _mm_cmpistrm(a, b, imm8) +#endif +#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpistrm + #define _mm_cmpistrm(a, b, imm8) simde_mm_cmpistrm(a, b, imm8) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int simde_mm_cmpistro (simde__m128i a, simde__m128i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private 
b_ = simde__m128i_to_private(b); + return 0; +} +#if defined(SIMDE_X86_SSE4_2_NATIVE) + #define simde_mm_cmpistro(a, b, imm8) _mm_cmpistro(a, b, imm8) +#endif +#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) + #undef _mm_cmpistro + #define _mm_cmpistro(a, b, imm8) simde_mm_cmpistro(a, b, imm8) +#endif + +#endif // unimplemented functions + SIMDE_FUNCTION_ATTRIBUTES int simde_mm_cmpestrs (simde__m128i a, int la, simde__m128i b, int lb, const int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { diff --git a/thirdparty/simde/x86/ssse3.h b/thirdparty/simde/x86/ssse3.h index db60c2fb5..305d9ad19 100644 --- a/thirdparty/simde/x86/ssse3.h +++ b/thirdparty/simde/x86/ssse3.h @@ -27,6 +27,16 @@ #if !defined(SIMDE_X86_SSSE3_H) #define SIMDE_X86_SSSE3_H +#include +#include + +#include "../hedley.h" +#include "../simde-diagnostic.h" +#include "../simde-features.h" +#include "../simde-math.h" +#include "../simde-common.h" +#include "mmx.h" +#include "sse2.h" #include "sse3.h" HEDLEY_DIAGNOSTIC_PUSH @@ -893,7 +903,7 @@ simde_mm_sign_epi8 (simde__m128i a, simde__m128i b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < 0) ? (- a_.i8[i]) : ((b_.i8[i] != 0) ? (a_.i8[i]) : INT8_C(0)); + r_.i8[i] = (b_.i8[i] < 0) ? HEDLEY_STATIC_CAST(int8_t, -a_.i8[i]) : ((b_.i8[i] != 0) ? (a_.i8[i]) : INT8_C(0)); } #endif @@ -935,7 +945,7 @@ simde_mm_sign_epi16 (simde__m128i a, simde__m128i b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < 0) ? (- a_.i16[i]) : ((b_.i16[i] != 0) ? (a_.i16[i]) : INT16_C(0)); + r_.i16[i] = (b_.i16[i] < 0) ? HEDLEY_STATIC_CAST(int16_t, -a_.i16[i]) : ((b_.i16[i] != 0) ? (a_.i16[i]) : INT16_C(0)); } #endif @@ -1012,7 +1022,7 @@ simde_mm_sign_pi8 (simde__m64 a, simde__m64 b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < 0) ? (- a_.i8[i]) : ((b_.i8[i] != 0) ? 
(a_.i8[i]) : INT8_C(0)); + r_.i8[i] = (b_.i8[i] < 0) ? HEDLEY_STATIC_CAST(int8_t, -a_.i8[i]) : ((b_.i8[i] != 0) ? (a_.i8[i]) : INT8_C(0)); } #endif @@ -1048,7 +1058,7 @@ simde_mm_sign_pi16 (simde__m64 a, simde__m64 b) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < 0) ? (- a_.i16[i]) : ((b_.i16[i] > 0) ? (a_.i16[i]) : INT16_C(0)); + r_.i16[i] = (b_.i16[i] < 0) ? HEDLEY_STATIC_CAST(int16_t, -a_.i16[i]) : ((b_.i16[i] > 0) ? (a_.i16[i]) : INT16_C(0)); } #endif diff --git a/thirdparty/simde/x86/svml.h b/thirdparty/simde/x86/svml.h index 40fe0cd6d..77940443f 100644 --- a/thirdparty/simde/x86/svml.h +++ b/thirdparty/simde/x86/svml.h @@ -2071,7 +2071,7 @@ simde_x_mm_deg2rad_ps(simde__m128 a) { #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) r_.f32 = a_.f32 * SIMDE_MATH_PI_OVER_180F; #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f32) tmp = { SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F }; + const __typeof__(r_.f32) tmp = { SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F }; r_.f32 = a_.f32 * tmp; #else SIMDE_VECTORIZE @@ -2097,9 +2097,9 @@ simde_x_mm_deg2rad_pd(simde__m128d a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vmulq_n_f64(a_.neon_i64, SIMDE_MATH_PI_OVER_180); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) - r_.f64 = a_.f64 * SIMDE_MATH_PI_OVER_180; + r_.f64 = a_.f64 * SIMDE_MATH_PI_OVER_180; #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f64) tmp = { SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180 }; + const __typeof__(r_.f64) tmp = { SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180 }; r_.f64 = a_.f64 * tmp; #else SIMDE_VECTORIZE @@ -2127,9 +2127,9 @@ simde_x_mm256_deg2rad_ps(simde__m256 a) { r_.m128[i] = simde_x_mm_deg2rad_ps(a_.m128[i]); } #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) 
&& !defined(SIMDE_BUG_GCC_53784) - r_.f32 = a_.f32 * SIMDE_MATH_PI_OVER_180F; + r_.f32 = a_.f32 * SIMDE_MATH_PI_OVER_180F; #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f32) tmp = { + const __typeof__(r_.f32) tmp = { SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F }; @@ -2160,9 +2160,9 @@ simde_x_mm256_deg2rad_pd(simde__m256d a) { r_.m128d[i] = simde_x_mm_deg2rad_pd(a_.m128d[i]); } #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) - r_.f64 = a_.f64 * SIMDE_MATH_PI_OVER_180; + r_.f64 = a_.f64 * SIMDE_MATH_PI_OVER_180; #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f64) tmp = { SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180 }; + const __typeof__(r_.f64) tmp = { SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180 }; r_.f64 = a_.f64 * tmp; #else SIMDE_VECTORIZE @@ -2190,9 +2190,9 @@ simde_x_mm512_deg2rad_ps(simde__m512 a) { r_.m256[i] = simde_x_mm256_deg2rad_ps(a_.m256[i]); } #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) - r_.f32 = a_.f32 * SIMDE_MATH_PI_OVER_180F; + r_.f32 = a_.f32 * SIMDE_MATH_PI_OVER_180F; #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f32) tmp = { + const __typeof__(r_.f32) tmp = { SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, @@ -2225,9 +2225,9 @@ simde_x_mm512_deg2rad_pd(simde__m512d a) { r_.m256d[i] = simde_x_mm256_deg2rad_pd(a_.m256d[i]); } #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) - r_.f64 = a_.f64 * 
SIMDE_MATH_PI_OVER_180; + r_.f64 = a_.f64 * SIMDE_MATH_PI_OVER_180; #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f64) tmp = { + const __typeof__(r_.f64) tmp = { SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180 }; @@ -2685,8 +2685,6 @@ simde_mm_div_epi8 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = a_.i8 / b_.i8; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x4_div(a_.wasm_v128, b_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { @@ -2715,8 +2713,6 @@ simde_mm_div_epi16 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = a_.i16 / b_.i16; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x4_div(a_.wasm_v128, b_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -2745,8 +2741,6 @@ simde_mm_div_epi32 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 / b_.i32; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_div(a_.wasm_v128, b_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { @@ -2778,8 +2772,6 @@ simde_mm_div_epi64 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = a_.i64 / b_.i64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x4_div(a_.wasm_v128, b_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { @@ -2808,8 +2800,6 @@ simde_mm_div_epu8 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.u8 = a_.u8 / b_.u8; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_div(a_.wasm_v128, b_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 
0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { @@ -2838,8 +2828,6 @@ simde_mm_div_epu16 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.u16 = a_.u16 / b_.u16; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x16_div(a_.wasm_v128, b_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -2868,8 +2856,6 @@ simde_mm_div_epu32 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.u32 = a_.u32 / b_.u32; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x16_div(a_.wasm_v128, b_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { @@ -2901,8 +2887,6 @@ simde_mm_div_epu64 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.u64 = a_.u64 / b_.u64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u64x16_div(a_.wasm_v128, b_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { @@ -6319,7 +6303,7 @@ simde_mm512_cdfnorminv_ps (simde__m512 a) { matched = matched | mask; /* else */ - simde__mmask16 mask_el = ~matched; + simde__mmask16 mask_el = HEDLEY_STATIC_CAST(simde__mmask16, ~matched); /* r = a - 0.5f */ simde__m512 r = simde_mm512_sub_ps(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.5))); @@ -6427,7 +6411,7 @@ simde_mm512_cdfnorminv_pd (simde__m512d a) { matched = matched | mask; /* else */ - simde__mmask8 mask_el = ~matched; + simde__mmask8 mask_el = HEDLEY_STATIC_CAST(simde__mmask8, ~matched); /* r = a - 0.5f */ simde__m512d r = simde_mm512_sub_pd(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.5))); @@ -7524,7 +7508,7 @@ simde_mm512_erfcinv_ps (simde__m512 a) { { /* else */ /* (a >= 2.0f) */ - retval = simde_mm512_or_ps(retval, simde_mm512_maskz_mov_ps(~matched, simde_mm512_set1_ps(-SIMDE_MATH_INFINITYF))); + retval = simde_mm512_or_ps(retval, 
simde_mm512_maskz_mov_ps(HEDLEY_STATIC_CAST(simde__mmask16, ~matched), simde_mm512_set1_ps(-SIMDE_MATH_INFINITYF))); } return retval; @@ -7667,7 +7651,7 @@ simde_mm512_erfcinv_pd (simde__m512d a) { { /* else */ /* (a >= 2.0f) */ - retval = simde_mm512_or_pd(retval, simde_mm512_maskz_mov_pd(~matched, simde_mm512_set1_pd(-SIMDE_MATH_INFINITY))); + retval = simde_mm512_or_pd(retval, simde_mm512_maskz_mov_pd(HEDLEY_STATIC_CAST(simde__mmask8, ~matched), simde_mm512_set1_pd(-SIMDE_MATH_INFINITY))); } return retval; diff --git a/thirdparty/simde/x86/xop.h b/thirdparty/simde/x86/xop.h index 5249f06d7..01af09035 100644 --- a/thirdparty/simde/x86/xop.h +++ b/thirdparty/simde/x86/xop.h @@ -27,6 +27,17 @@ #if !defined(SIMDE_X86_XOP_H) #define SIMDE_X86_XOP_H +#include +#include + +#include "../hedley.h" +#include "../simde-diagnostic.h" +#include "../simde-features.h" +#include "../simde-math.h" +#include "../simde-common.h" +#include "sse.h" +#include "sse2.h" +#include "avx.h" #include "avx2.h" #if !defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)