@@ -89,9 +89,6 @@
 #define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
 #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
 #elif defined(_MSC_VER)
-#if _MSVC_TRADITIONAL
-#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
-#endif
 #ifndef FORCE_INLINE
 #define FORCE_INLINE static inline
 #endif
@@ -184,6 +181,10 @@
 } while (0)
 #endif
 
+#ifdef _M_ARM
+#define vst1q_lane_s64(a, b, c)
+#endif
+
 /* Memory barriers
  * __atomic_thread_fence does not include a compiler barrier; instead,
  * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
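Note: the empty vst1q_lane_s64(a, b, c) definition above only lets references to the
intrinsic compile on MSVC for ARM32; it expands to nothing, so no store is performed.
A hypothetical working fallback (not part of this patch, assuming MSVC's arm_neon.h
exposes vgetq_lane_s64) could write the selected lane explicitly:

/* Sketch: store lane `lane` of a 64x2 vector to memory (lane must be a constant). */
#define sse2neon_vst1q_lane_s64(ptr, vec, lane) \
    (*(ptr) = vgetq_lane_s64((vec), (lane)))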
@@ -202,8 +203,12 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #elif defined(__GNUC__) || defined(__clang__)
     __atomic_thread_fence(__ATOMIC_SEQ_CST);
 #else /* MSVC */
+#ifdef _M_ARM
+    __dmb(_ARM_BARRIER_ISH);
+#else
     __dmb(_ARM64_BARRIER_ISH);
 #endif
+#endif
 }
 
 /* Architecture-specific build options */
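For reference, _ARM_BARRIER_ISH selects the AArch32 "DMB ISH" (inner-shareable data
memory barrier), the same fence the GCC/Clang path obtains through
__atomic_thread_fence. A minimal sketch of the equivalent inline-asm form on
GCC/Clang for AArch32 (illustrative only; the header itself uses the atomic builtin):

/* Sketch: inner-shareable data memory barrier plus compiler barrier. */
static inline void demo_dmb_ish(void)
{
    __asm__ __volatile__("dmb ish" ::: "memory");
}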
@@ -268,7 +273,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
  * we have to perform syscall instead.
  */
 #if (!defined(__aarch64__) && !defined(_M_ARM64))
-#include <sys/time.h>
+#include <time.h>
 #endif
 
 /* "__has_builtin" can be used to query support for built-in functions
@@ -574,10 +579,10 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
 /* Backwards compatibility for compilers with lack of specific type support */
 
 // Older gcc does not define vld1q_u8_x4 type
-#if defined(__GNUC__) && !defined(__clang__) && \
+#if defined(_M_ARM) || (defined(__GNUC__) && !defined(__clang__) && \
     ((__GNUC__ <= 12 && defined(__arm__)) || \
      (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
-     (__GNUC__ <= 9 && defined(__aarch64__)))
+     (__GNUC__ <= 9 && defined(__aarch64__))))
 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 {
     uint8x16x4_t ret;
@@ -610,6 +615,9 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 }
 #endif
 
+#if defined(_M_ARM)
+#pragma message("TODO: Windows ARM32: Port many SSE2NEON functions")
+#else
 #if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddvq u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
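The _M_ARM branch above stubs out the horizontal-add helpers behind a pragma for now.
A hypothetical ARM32-compatible emulation (not part of this patch) can be built from
widening pairwise adds, which AArch32 NEON does provide:

/* Sketch: sum all 16 unsigned bytes by repeatedly widening and pairwise-adding. */
FORCE_INLINE uint8_t sse2neon_vaddvq_u8_arm32(uint8x16_t a)
{
    uint16x8_t s16 = vpaddlq_u8(a);    /* 16 x u8 -> 8 x u16 partial sums */
    uint32x4_t s32 = vpaddlq_u16(s16); /* 8 x u16 -> 4 x u32 */
    uint64x2_t s64 = vpaddlq_u32(s32); /* 4 x u32 -> 2 x u64 */
    return (uint8_t) (vgetq_lane_u64(s64, 0) + vgetq_lane_u64(s64, 1));
}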
@@ -645,6 +653,7 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
     return vaddvq_u16(a);
 }
 #endif
+#endif
 
 /* Function Naming Conventions
  * The naming convention of SSE intrinsics is straightforward. A generic SSE
@@ -1765,6 +1774,7 @@ FORCE_INLINE void _mm_free(void *addr)
 }
 #endif
 
+#ifndef _M_ARM
 FORCE_INLINE uint64_t _sse2neon_get_fpcr()
 {
     uint64_t value;
@@ -1808,6 +1818,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
 
     return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
 }
+#endif
 
 // Macro: Get the rounding mode bits from the MXCSR control and status register.
 // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
@@ -1826,6 +1837,8 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
 
 #if defined(__aarch64__) || defined(_M_ARM64)
     r.value = _sse2neon_get_fpcr();
+#elif defined(_M_ARM)
+    r.value = _MoveFromCoprocessor(10, 7, 1, 0, 0);
 #else
     __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
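MSVC for ARM32 has no inline assembly, so the FPSCR is read through the MRC intrinsic:
_MoveFromCoprocessor(10, 7, 1, 0, 0) encodes the same coprocessor access that VMRS
performs. A minimal sketch of pulling the rounding mode out of that raw value
(assuming the usual FPSCR layout with RMode in bits [23:22]; names are illustrative):

/* Sketch: map FPSCR RMode bits to the SSE-style rounding constants. */
static unsigned int fpscr_rounding_mode(uint32_t fpscr)
{
    switch ((fpscr >> 22) & 0x3) {
    case 0: return _MM_ROUND_NEAREST;
    case 1: return _MM_ROUND_UP;
    case 2: return _MM_ROUND_DOWN;
    default: return _MM_ROUND_TOWARD_ZERO;
    }
}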
@@ -2247,7 +2260,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
 FORCE_INLINE void _mm_prefetch(char const *p, int i)
 {
     (void) i;
-#if defined(_MSC_VER)
+#ifdef _M_ARM64
     switch (i) {
     case _MM_HINT_NTA:
         __prefetch2(p, 1);
@@ -2262,6 +2275,8 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i)
         __prefetch2(p, 4);
         break;
     }
+#elif defined(_M_ARM)
+    // TODO
 #else
     switch (i) {
     case _MM_HINT_NTA:
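The ARM32 MSVC branch is left as a TODO here. One possible way to fill it later (a
sketch, assuming MSVC's ARM32 __prefetch(const void *) intrinsic, which takes no hint
argument) would be to issue a plain prefetch regardless of the SSE hint:

/* Sketch: hint-agnostic fallback that the _M_ARM branch could use. */
static void prefetch_arm32(char const *p)
{
    __prefetch(p); /* best-effort prefetch; the _MM_HINT_* value is ignored */
}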
@@ -2348,6 +2363,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
         vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
 }
 
+#ifndef _M_ARM
 // Macro: Set the flush zero bits of the MXCSR control and status register to
 // the value in unsigned 32-bit integer a. The flush zero may contain any of the
 // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
@@ -2379,6 +2395,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
     __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }
+#endif
 
 // Set packed single-precision (32-bit) floating-point elements in dst with the
 // supplied values.
@@ -2404,6 +2421,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 {
+#ifndef _M_ARM
     union {
         fpcr_bitfield field;
 #if defined(__aarch64__) || defined(_M_ARM64)
@@ -2442,6 +2460,7 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 #else
     __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
+#endif
 }
 
 // Copy single-precision (32-bit) floating-point element a to the lower element
@@ -3206,6 +3225,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
 }
 
+#ifndef _M_ARM
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for greater-than-or-equal, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
@@ -3247,6 +3267,7 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
+#endif
 
 // Compare packed signed 16-bit integers in a and b for greater-than, and store
 // the results in dst.
@@ -3275,6 +3296,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
         vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }
 
+#ifndef _M_ARM
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for greater-than, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
@@ -3358,6 +3380,7 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
+#endif
 
 // Compare packed signed 16-bit integers in a and b for less-than, and store the
 // results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
@@ -3389,6 +3412,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
         vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }
 
+#ifndef _M_ARM
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for less-than, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
@@ -3429,6 +3453,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
+#endif
 
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for not-equal, and store the results in dst.
@@ -3456,6 +3481,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
 }
 
+#ifndef _M_ARM
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for not-greater-than-or-equal, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
@@ -3756,6 +3782,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
     return (*(double *) &a0 < *(double *) &b0);
 #endif
 }
+#endif
 
 // Compare the lower double-precision (64-bit) floating-point element in a and b
 // for equality, and return the boolean result (0 or 1).
@@ -4401,6 +4428,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
         vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
 }
 
+#ifndef _M_ARM
 // Compare packed double-precision (64-bit) floating-point elements in a and b,
 // and store packed maximum values in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
@@ -4487,6 +4515,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
+#endif
 
 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b, store the minimum value in the lower element of dst, and copy the upper
@@ -4793,7 +4822,11 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
 FORCE_INLINE void _mm_pause()
 {
 #if defined(_MSC_VER)
+#ifdef _M_ARM
+    __isb(_ARM_BARRIER_SY);
+#else
     __isb(_ARM64_BARRIER_SY);
+#endif
 #else
     __asm__ __volatile__("isb\n");
 #endif
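_mm_pause maps the x86 PAUSE hint onto an ISB on both MSVC ARM targets, with the
AArch32 or AArch64 barrier constant selected above. A typical spin-wait usage sketch
(names are illustrative, not from sse2neon):

/* Sketch: back off inside a spin loop while waiting for a flag to be set. */
static void spin_until_ready(volatile int *ready)
{
    while (!*ready)
        _mm_pause(); /* cheap back-off hint between polls of the flag */
}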
@@ -7622,6 +7655,7 @@ FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
 }
 
 /* SSE4.2 */
+#ifndef _M_ARM
 
 const static uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
     0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
@@ -8463,9 +8497,11 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
     return crc;
 }
 
+#endif
+
 /* AES */
 
-#if !defined(__ARM_FEATURE_CRYPTO) && !defined(_M_ARM64)
+#if !defined(__ARM_FEATURE_CRYPTO) && !defined(_M_ARM64) && !defined(_M_ARM)
 /* clang-format off */
 #define SSE2NEON_AES_SBOX(w) \
     { \
@@ -8913,6 +8949,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
 #undef SSE2NEON_MULTIPLY
 #endif
 
+#elif defined(_M_ARM)
 #else /* __ARM_FEATURE_CRYPTO */
 // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
 // AESMC and then manually applying the real key as an xor operation. This
@@ -9034,6 +9071,7 @@ FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
     }
 }
 
+#ifndef _M_ARM
 FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
 {
     union {
@@ -9053,6 +9091,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
 
     return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
 }
+#endif
 
 // Count the number of bits set to 1 in unsigned 32-bit integer a, and
 // return that count in dst.
@@ -9113,6 +9152,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 #endif
 }
 
+#ifndef _M_ARM
 FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 {
     // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
@@ -9140,6 +9180,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
     __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
 #endif
 }
+#endif
 
 // Return the current 64-bit value of the processor's time-stamp counter.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
@@ -9161,6 +9202,9 @@ FORCE_INLINE uint64_t _rdtsc(void)
 #endif
 
     return val;
+#elif defined(_M_ARM)
+    uint32_t val = _MoveFromCoprocessor(15, 0, 9, 13, 0);
+    return ((uint64_t) val) << 6;
 #else
     uint32_t pmccntr, pmuseren, pmcntenset;
     // Read the user mode Performance Monitoring Unit (PMU)
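On Windows ARM32 the time-stamp counter is approximated from the PMU cycle counter:
_MoveFromCoprocessor(15, 0, 9, 13, 0) is the MRC read of PMCCNTR (CP15 c9, c13, 0),
and the shift by 6 assumes the PMCR.D divider is set so the register ticks once every
64 cycles. A rough usage sketch, assuming the counter is accessible from user mode:

/* Sketch: measure an interval in (approximate) CPU cycles. */
static uint64_t cycles_elapsed(void (*fn)(void))
{
    uint64_t start = _rdtsc();
    fn();
    return _rdtsc() - start; /* wraps if the 32-bit PMCCNTR overflows in between */
}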