Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 44 additions & 26 deletions simde/x86/avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -3615,22 +3615,31 @@ simde__m128i
simde_mm256_cvtpd_epi32 (simde__m256d a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_cvtpd_epi32(a);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
simde__m256d_private a_;
a_.i256 = __lasx_xvftintrne_w_d(a, a);
a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8);
return a_.m128d_private[0].lsx_i64;
#else
simde__m128i_private r_;
simde__m256d_private a_ = simde__m256d_to_private(a);
simde__m256d_private a_;

#if defined(simde_math_nearbyint)
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
a_ = simde__m256d_to_private(a);
r_.m64[0] = simde_mm_cvtpd_pi32(a_.m128d[0]);
r_.m64[1] = simde_mm_cvtpd_pi32(a_.m128d[1]);
#else
a_ = simde__m256d_to_private(simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_NEAREST_INT));
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[i]));
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
simde_float64 v = simde_math_round(a_.f64[i]);
#if defined(SIMDE_FAST_CONVERSION_RANGE)
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
#else
r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#endif
}
#else
HEDLEY_UNREACHABLE();
#endif

return simde__m128i_from_private(r_);
Expand Down Expand Up @@ -3673,19 +3682,28 @@ simde__m256i
simde_mm256_cvtps_epi32 (simde__m256 a) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_cvtps_epi32(a);
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE)
#elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
return __lasx_xvftintrne_w_s(a);
#else
simde__m256i_private r_;
simde__m256_private a_ = simde__m256_to_private(a);
simde__m256_private a_;

#if defined(simde_math_nearbyintf)
#if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
a_ = simde__m256_to_private(a);
r_.m128i[0] = simde_mm_cvtps_epi32(a_.m128[0]);
r_.m128i[1] = simde_mm_cvtps_epi32(a_.m128[1]);
#else
a_ = simde__m256_to_private(simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT));
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyintf(a_.f32[i]));
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
simde_float32 v = simde_math_roundf(a_.f32[i]);
#if defined(SIMDE_FAST_CONVERSION_RANGE)
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
#else
r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#endif
}
#else
HEDLEY_UNREACHABLE();
#endif

return simde__m256i_from_private(r_);
Expand Down Expand Up @@ -6652,7 +6670,6 @@ simde_mm256_testc_si256 (simde__m256i a, simde__m256i b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testc_si256(a, b);
#else
int_fast32_t r = 0;
simde__m256i_private
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
Expand All @@ -6661,14 +6678,16 @@ simde_mm256_testc_si256 (simde__m256i a, simde__m256i b) {
a_.i256 = __lasx_xvandn_v(a_.i256, b_.i256);
a_.i256 = __lasx_xvmsknz_b(a_.i256);
return (__lasx_xvpickve2gr_w(a_.i256, 0) + __lasx_xvpickve2gr_w(a_.i256, 4)) ? 0 : 1;
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
return simde_mm_testc_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testc_si128(a_.m128i[1], b_.m128i[1]);
#else
int_fast32_t r = 0;
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
r |= ~a_.i32f[i] & b_.i32f[i];
}
return HEDLEY_STATIC_CAST(int, !r);
#endif

return HEDLEY_STATIC_CAST(int, !r);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
Expand Down Expand Up @@ -6810,7 +6829,6 @@ simde_mm256_testz_si256 (simde__m256i a, simde__m256i b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testz_si256(a, b);
#else
int_fast32_t r = 0;
simde__m256i_private
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);
Expand All @@ -6820,17 +6838,15 @@ simde_mm256_testz_si256 (simde__m256i a, simde__m256i b) {
a_.i256 = __lasx_xvmsknz_b(a_.i256);
return (__lasx_xvpickve2gr_w(a_.i256, 0) + __lasx_xvpickve2gr_w(a_.i256, 4)) ? 0 : 1;
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
r = simde_mm_testz_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testz_si128(a_.m128i[1], b_.m128i[1]);
return simde_mm_testz_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testz_si128(a_.m128i[1], b_.m128i[1]);
#else
int_fast32_t r = 0;
SIMDE_VECTORIZE_REDUCTION(|:r)
for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
r |= a_.i32f[i] & b_.i32f[i];
}

r = !r;
return HEDLEY_STATIC_CAST(int, !r);
#endif

return HEDLEY_STATIC_CAST(int, r);
#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
Expand Down Expand Up @@ -6992,25 +7008,27 @@ simde_mm256_testnzc_si256 (simde__m256i a, simde__m256i b) {
#if defined(SIMDE_X86_AVX_NATIVE)
return _mm256_testnzc_si256(a, b);
#else
int32_t rc = 0, rz = 0;
simde__m256i_private
a_ = simde__m256i_to_private(a),
b_ = simde__m256i_to_private(b);

#if defined(SIMDE_LOONGARCH_LASX_NATIVE)
int_fast32_t rc = 0, rz = 0;
__m256i m = __lasx_xvandn_v(a_.i256, b_.i256);
__m256i n = __lasx_xvand_v(a_.i256, b_.i256);
m = __lasx_xvmsknz_b(m); n = __lasx_xvmsknz_b(n);
rc = __lasx_xvpickve2gr_w(m, 0) + __lasx_xvpickve2gr_w(m, 4);
rz = __lasx_xvpickve2gr_w(n, 0) + __lasx_xvpickve2gr_w(n, 4);
return (rc != 0) && (rz != 0);
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
return simde_mm_testnzc_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testnzc_si128(a_.m128i[1], b_.m128i[1]);
#else
int_fast32_t rc = 0, rz = 0;
for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) {
rc |= ~a_.i32f[i] & b_.i32f[i];
rz |= a_.i32f[i] & b_.i32f[i];
}

return !!(rc & rz);
return HEDLEY_STATIC_CAST(int, rc && rz);
#endif
#endif
}
Expand Down
41 changes: 37 additions & 4 deletions test/x86/avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -7107,7 +7107,20 @@ test_simde_mm256_cvtpd_epi32(SIMDE_MUNIT_TEST_ARGS) {
const struct {
simde__m256d a;
simde__m128i r;
} test_vec[8] = {
} test_vec[] = {
#if !defined(SIMDE_FAST_NANS)
{ simde_mm256_set_pd(SIMDE_MATH_NAN, -SIMDE_MATH_NAN, 0.0, 0.0),
simde_mm_set_epi32( INT32_MIN, INT32_MIN, 0, 0) },
#endif
#if !defined(SIMDE_FAST_CONVERSION_RANGE)
{ simde_mm256_set_pd(
HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) + 1),
HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) - 100),
HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) - 1),
HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) + 100)),
simde_mm_set_epi32(
INT32_MIN, INT32_C(2147483547), INT32_MIN, -INT32_C(2147483548)) },
#endif
{ simde_mm256_set_pd(SIMDE_FLOAT64_C( 823.92), SIMDE_FLOAT64_C( -252.31),
SIMDE_FLOAT64_C( 311.42), SIMDE_FLOAT64_C( 639.08)),
simde_mm_set_epi32(INT32_C( 824), INT32_C(-252), INT32_C( 311), INT32_C( 639)) },
Expand Down Expand Up @@ -7187,7 +7200,22 @@ test_simde_mm256_cvtps_epi32(SIMDE_MUNIT_TEST_ARGS) {
const struct {
simde__m256 a;
simde__m256i r;
} test_vec[8] = {
} test_vec[] = {
#if !defined(SIMDE_FAST_NANS)
{ simde_mm256_set_ps(SIMDE_MATH_NAN, -SIMDE_MATH_NAN, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f),
simde_mm256_set_epi32( INT32_MIN, INT32_MIN, 0, 0, 0, 0, 0, 0) },
#endif
#if !defined(SIMDE_FAST_CONVERSION_RANGE)
{ simde_mm256_set_ps(
HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) + 1),
HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) - 100),
HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) - 1),
HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) + 100),
0.f, 0.f, 0.f, 0.f),
simde_mm256_set_epi32(
INT32_MIN, INT32_C(2147483520), INT32_MIN, -INT32_C(2147483520),
0, 0, 0, 0) },
#endif
{ simde_mm256_set_ps(SIMDE_FLOAT32_C( 598.58), SIMDE_FLOAT32_C( 571.41),
SIMDE_FLOAT32_C( -242.37), SIMDE_FLOAT32_C( -717.41),
SIMDE_FLOAT32_C( 374.26), SIMDE_FLOAT32_C( -165.53),
Expand Down Expand Up @@ -16491,7 +16519,7 @@ test_simde_mm256_testnzc_si256(SIMDE_MUNIT_TEST_ARGS) {
simde__m256i a;
simde__m256i b;
int r;
} test_vec[8] = {
} test_vec[9] = {
{ simde_mm256_set_epi64x(INT64_C(-6804708873655136040), INT64_C( 4446918229480945172),
INT64_C(-6458803806102185271), INT64_C( 6419639704555297719)),
simde_mm256_set_epi64x(INT64_C( 4086527184939990173), INT64_C(-4592254743728630867),
Expand Down Expand Up @@ -16531,7 +16559,12 @@ test_simde_mm256_testnzc_si256(SIMDE_MUNIT_TEST_ARGS) {
INT64_C( -1), INT64_C( -1)),
simde_mm256_set_epi64x(INT64_C( 0), INT64_C( 0),
INT64_C( 0), INT64_C( 0)),
0 }
0 },
{ simde_mm256_set_epi64x(INT64_C( 1), INT64_C( 1),
INT64_C( 1), INT64_C( 1)),
simde_mm256_set_epi64x(INT64_C( -1), INT64_C( -1),
INT64_C( -1), INT64_C( -1)),
1 }
};

for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) {
Expand Down
Loading