Skip to content

Commit ee04c66

Browse files
committed
99% done Windows ARM32 port
1 parent bee7600 commit ee04c66

37 files changed

+591
-136
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
*~
12
*.mak
23
*.mak.vpc_crc
34
*.vpc_crc

common/sse2neon.h

+52-8
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,6 @@
8989
#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
9090
#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
9191
#elif defined(_MSC_VER)
92-
#if _MSVC_TRADITIONAL
93-
#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
94-
#endif
9592
#ifndef FORCE_INLINE
9693
#define FORCE_INLINE static inline
9794
#endif
@@ -184,6 +181,10 @@
184181
} while (0)
185182
#endif
186183

184+
#ifdef _M_ARM
185+
#define vst1q_lane_s64(a, b, c)
186+
#endif
187+
187188
/* Memory barriers
188189
* __atomic_thread_fence does not include a compiler barrier; instead,
189190
* the barrier is part of __atomic_load/__atomic_store's "volatile-like"
@@ -202,8 +203,12 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
202203
#elif defined(__GNUC__) || defined(__clang__)
203204
__atomic_thread_fence(__ATOMIC_SEQ_CST);
204205
#else /* MSVC */
206+
#ifdef _M_ARM
207+
__dmb(_ARM_BARRIER_ISH);
208+
#else
205209
__dmb(_ARM64_BARRIER_ISH);
206210
#endif
211+
#endif
207212
}
208213

209214
/* Architecture-specific build options */
@@ -268,7 +273,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
268273
* we have to perform syscall instead.
269274
*/
270275
#if (!defined(__aarch64__) && !defined(_M_ARM64))
271-
#include <sys/time.h>
276+
#include <time.h>
272277
#endif
273278

274279
/* "__has_builtin" can be used to query support for built-in functions
@@ -574,10 +579,10 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
574579
/* Backwards compatibility for compilers with lack of specific type support */
575580

576581
// Older gcc does not define vld1q_u8_x4 type
577-
#if defined(__GNUC__) && !defined(__clang__) && \
582+
#if defined(_M_ARM) || (defined(__GNUC__) && !defined(__clang__) && \
578583
((__GNUC__ <= 12 && defined(__arm__)) || \
579584
(__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
580-
(__GNUC__ <= 9 && defined(__aarch64__)))
585+
(__GNUC__ <= 9 && defined(__aarch64__))))
581586
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
582587
{
583588
uint8x16x4_t ret;
@@ -610,6 +615,9 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
610615
}
611616
#endif
612617

618+
#if defined(_M_ARM)
619+
#pragma message("TODO: Windows ARM32: Port many SSE2NEON functions")
620+
#else
613621
#if !defined(__aarch64__) && !defined(_M_ARM64)
614622
/* emulate vaddvq u8 variant */
615623
FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
@@ -645,6 +653,7 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
645653
return vaddvq_u16(a);
646654
}
647655
#endif
656+
#endif
648657

649658
/* Function Naming Conventions
650659
* The naming convention of SSE intrinsics is straightforward. A generic SSE
@@ -1765,6 +1774,7 @@ FORCE_INLINE void _mm_free(void *addr)
17651774
}
17661775
#endif
17671776

1777+
#ifndef _M_ARM
17681778
FORCE_INLINE uint64_t _sse2neon_get_fpcr()
17691779
{
17701780
uint64_t value;
@@ -1808,6 +1818,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
18081818

18091819
return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
18101820
}
1821+
#endif
18111822

18121823
// Macro: Get the rounding mode bits from the MXCSR control and status register.
18131824
// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
@@ -1826,6 +1837,8 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
18261837

18271838
#if defined(__aarch64__) || defined(_M_ARM64)
18281839
r.value = _sse2neon_get_fpcr();
1840+
#elif defined(_M_ARM)
1841+
r.value = _MoveFromCoprocessor(10,7, 1,0,0);
18291842
#else
18301843
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
18311844
#endif
@@ -2247,7 +2260,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
22472260
FORCE_INLINE void _mm_prefetch(char const *p, int i)
22482261
{
22492262
(void) i;
2250-
#if defined(_MSC_VER)
2263+
#ifdef _M_ARM64
22512264
switch (i) {
22522265
case _MM_HINT_NTA:
22532266
__prefetch2(p, 1);
@@ -2262,6 +2275,8 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i)
22622275
__prefetch2(p, 4);
22632276
break;
22642277
}
2278+
#elif defined(_M_ARM)
2279+
// TODO
22652280
#else
22662281
switch (i) {
22672282
case _MM_HINT_NTA:
@@ -2348,6 +2363,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
23482363
vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
23492364
}
23502365

2366+
#ifndef _M_ARM
23512367
// Macro: Set the flush zero bits of the MXCSR control and status register to
23522368
// the value in unsigned 32-bit integer a. The flush zero may contain any of the
23532369
// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
@@ -2379,6 +2395,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
23792395
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
23802396
#endif
23812397
}
2398+
#endif
23822399

23832400
// Set packed single-precision (32-bit) floating-point elements in dst with the
23842401
// supplied values.
@@ -2404,6 +2421,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
24042421
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
24052422
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
24062423
{
2424+
#ifndef _M_ARM
24072425
union {
24082426
fpcr_bitfield field;
24092427
#if defined(__aarch64__) || defined(_M_ARM64)
@@ -2442,6 +2460,7 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
24422460
#else
24432461
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
24442462
#endif
2463+
#endif
24452464
}
24462465

24472466
// Copy single-precision (32-bit) floating-point element a to the lower element
@@ -3206,6 +3225,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
32063225
return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
32073226
}
32083227

3228+
#ifndef _M_ARM
32093229
// Compare packed double-precision (64-bit) floating-point elements in a and b
32103230
// for greater-than-or-equal, and store the results in dst.
32113231
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
@@ -3247,6 +3267,7 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
32473267
return vreinterpretq_m128d_u64(vld1q_u64(d));
32483268
#endif
32493269
}
3270+
#endif
32503271

32513272
// Compare packed signed 16-bit integers in a and b for greater-than, and store
32523273
// the results in dst.
@@ -3275,6 +3296,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
32753296
vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
32763297
}
32773298

3299+
#ifndef _M_ARM
32783300
// Compare packed double-precision (64-bit) floating-point elements in a and b
32793301
// for greater-than, and store the results in dst.
32803302
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
@@ -3358,6 +3380,7 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
33583380
return vreinterpretq_m128d_u64(vld1q_u64(d));
33593381
#endif
33603382
}
3383+
#endif
33613384

33623385
// Compare packed signed 16-bit integers in a and b for less-than, and store the
33633386
// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
@@ -3389,6 +3412,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
33893412
vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
33903413
}
33913414

3415+
#ifndef _M_ARM
33923416
// Compare packed double-precision (64-bit) floating-point elements in a and b
33933417
// for less-than, and store the results in dst.
33943418
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
@@ -3429,6 +3453,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
34293453
return vreinterpretq_m128d_u64(vld1q_u64(d));
34303454
#endif
34313455
}
3456+
#endif
34323457

34333458
// Compare packed double-precision (64-bit) floating-point elements in a and b
34343459
// for not-equal, and store the results in dst.
@@ -3456,6 +3481,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
34563481
return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
34573482
}
34583483

3484+
#ifndef _M_ARM
34593485
// Compare packed double-precision (64-bit) floating-point elements in a and b
34603486
// for not-greater-than-or-equal, and store the results in dst.
34613487
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
@@ -3756,6 +3782,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
37563782
return (*(double *) &a0 < *(double *) &b0);
37573783
#endif
37583784
}
3785+
#endif
37593786

37603787
// Compare the lower double-precision (64-bit) floating-point element in a and b
37613788
// for equality, and return the boolean result (0 or 1).
@@ -4401,6 +4428,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
44014428
vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
44024429
}
44034430

4431+
#ifndef _M_ARM
44044432
// Compare packed double-precision (64-bit) floating-point elements in a and b,
44054433
// and store packed maximum values in dst.
44064434
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
@@ -4487,6 +4515,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
44874515
return vreinterpretq_m128d_u64(vld1q_u64(d));
44884516
#endif
44894517
}
4518+
#endif
44904519

44914520
// Compare the lower double-precision (64-bit) floating-point elements in a and
44924521
// b, store the minimum value in the lower element of dst, and copy the upper
@@ -4793,7 +4822,11 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
47934822
FORCE_INLINE void _mm_pause()
47944823
{
47954824
#if defined(_MSC_VER)
4825+
#ifdef _M_ARM
4826+
__isb(_ARM_BARRIER_SY);
4827+
#else
47964828
__isb(_ARM64_BARRIER_SY);
4829+
#endif
47974830
#else
47984831
__asm__ __volatile__("isb\n");
47994832
#endif
@@ -7622,6 +7655,7 @@ FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
76227655
}
76237656

76247657
/* SSE4.2 */
7658+
#ifndef _M_ARM
76257659

76267660
const static uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
76277661
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
@@ -8463,9 +8497,11 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
84638497
return crc;
84648498
}
84658499

8500+
#endif
8501+
84668502
/* AES */
84678503

8468-
#if !defined(__ARM_FEATURE_CRYPTO) && !defined(_M_ARM64)
8504+
#if !defined(__ARM_FEATURE_CRYPTO) && !defined(_M_ARM64) && !defined(_M_ARM)
84698505
/* clang-format off */
84708506
#define SSE2NEON_AES_SBOX(w) \
84718507
{ \
@@ -8913,6 +8949,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
89138949
#undef SSE2NEON_MULTIPLY
89148950
#endif
89158951

8952+
#elif defined(_M_ARM)
89168953
#else /* __ARM_FEATURE_CRYPTO */
89178954
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
89188955
// AESMC and then manually applying the real key as an xor operation. This
@@ -9034,6 +9071,7 @@ FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
90349071
}
90359072
}
90369073

9074+
#ifndef _M_ARM
90379075
FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
90389076
{
90399077
union {
@@ -9053,6 +9091,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
90539091

90549092
return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
90559093
}
9094+
#endif
90569095

90579096
// Count the number of bits set to 1 in unsigned 32-bit integer a, and
90589097
// return that count in dst.
@@ -9113,6 +9152,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
91139152
#endif
91149153
}
91159154

9155+
#ifndef _M_ARM
91169156
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
91179157
{
91189158
// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
@@ -9140,6 +9180,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
91409180
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
91419181
#endif
91429182
}
9183+
#endif
91439184

91449185
// Return the current 64-bit value of the processor's time-stamp counter.
91459186
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
@@ -9161,6 +9202,9 @@ FORCE_INLINE uint64_t _rdtsc(void)
91619202
#endif
91629203

91639204
return val;
9205+
#elif defined(_M_ARM)
9206+
uint32_t val = _MoveFromCoprocessor(15,0, 9,13,0);
9207+
return ((uint64_t)val) << 6;
91649208
#else
91659209
uint32_t pmccntr, pmuseren, pmcntenset;
91669210
// Read the user mode Performance Monitoring Unit (PMU)

engine/cmodel.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -862,7 +862,7 @@ BOX TRACING
862862

863863
// Custom SIMD implementation for box brushes
864864

865-
const fltx4 Four_DistEpsilons={DIST_EPSILON,DIST_EPSILON,DIST_EPSILON,DIST_EPSILON};
865+
const fltx4 Four_DistEpsilons=FLTX4(DIST_EPSILON,DIST_EPSILON,DIST_EPSILON,DIST_EPSILON);
866866
const int32 ALIGN16 g_CubeFaceIndex0[4] ALIGN16_POST = {0,1,2,-1};
867867
const int32 ALIGN16 g_CubeFaceIndex1[4] ALIGN16_POST = {3,4,5,-1};
868868
bool IntersectRayWithBoxBrush( TraceInfo_t *pTraceInfo, const cbrush_t *pBrush, cboxbrush_t *pBox )
@@ -1572,7 +1572,7 @@ void FASTCALL CM_TraceToLeaf( TraceInfo_t * RESTRICT pTraceInfo, int ndxLeaf, fl
15721572
fltx4 traceStart = LoadUnaligned3SIMD(pTraceInfo->m_start.Base());
15731573
fltx4 traceDelta = LoadUnaligned3SIMD(pTraceInfo->m_delta.Base());
15741574
fltx4 traceInvDelta = LoadUnaligned3SIMD(pTraceInfo->m_invDelta.Base());
1575-
static const fltx4 vecEpsilon = {DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON};
1575+
static const fltx4 vecEpsilon = FLTX4(DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON);
15761576
// only used in !IS_POINT version:
15771577
fltx4 extents;
15781578
if (!IS_POINT)

engine/l_studio.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
#include "materialsystem/materialsystem_config.h"
4141
#include "materialsystem/itexture.h"
4242
#include "IHammer.h"
43-
#if defined( _WIN32 ) && !defined( _X360 )
43+
#if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
4444
#include <xmmintrin.h>
4545
#endif
4646
#include "staticpropmgr.h"

engine/sys_engine.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ extern ConVar host_timer_spin_ms;
104104
extern float host_nexttick;
105105
extern IVEngineClient *engineClient;
106106

107-
#ifdef WIN32
107+
#if defined(_WIN32) && !defined(_M_ARM)
108108
static void cpu_frequency_monitoring_callback( IConVar *var, const char *pOldValue, float flOldValue )
109109
{
110110
// Set the specified interval for CPU frequency monitoring

game/client/detailobjectsystem.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -2122,8 +2122,8 @@ int CDetailObjectSystem::SortSpritesBackToFront( int nLeaf, const Vector &viewOr
21222122
#else
21232123
#define MANTISSA_LSB_OFFSET 0
21242124
#endif
2125-
static fltx4 Four_MagicNumbers={ MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER };
2126-
static fltx4 Four_255s={ 255.0, 255.0, 255.0, 255.0 };
2125+
static fltx4 Four_MagicNumbers=FLTX4( MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER );
2126+
static fltx4 Four_255s=FLTX4( 255.0, 255.0, 255.0, 255.0 );
21272127

21282128
static ALIGN16 int32 And255Mask[4] ALIGN16_POST = {0xff,0xff,0xff,0xff};
21292129
#define PIXMASK ( * ( reinterpret_cast< fltx4 *>( &And255Mask ) ) )

inputsystem/inputsystem.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,10 @@ InitReturnVal_t CInputSystem::Init()
167167

168168
joy_xcontroller_found.SetValue( 0 );
169169

170+
#ifdef USE_SDL
170171
if( !m_bConsoleTextMode )
171172
InitializeTouch();
173+
#endif
172174

173175
if ( IsPC() && !m_bConsoleTextMode )
174176
{
@@ -975,7 +977,9 @@ void CInputSystem::SetPrimaryUserId( int userId )
975977
//-----------------------------------------------------------------------------
976978
void CInputSystem::SetRumble( float fLeftMotor, float fRightMotor, int userId )
977979
{
980+
#ifdef USE_SDL
978981
SetXDeviceRumble( fLeftMotor, fRightMotor, userId );
982+
#endif
979983
}
980984

981985

0 commit comments

Comments
 (0)