@@ -1272,3 +1272,29 @@ inline SIMD<int32_t, 16> vclamp(SIMD<int32_t, 16> x, int32_t a, int32_t b)
1272
1272
return tmp;
1273
1273
}
1274
1274
1275
+ template <>
1276
+ inline SIMD<uint8_t , 64 > vshuf (SIMD<uint8_t , 64 > a, SIMD<uint8_t , 64 > b)
1277
+ {
1278
+ SIMD<uint8_t , 64 > tmp;
1279
+ for (int i = 0 ; i < 4 ; ++i)
1280
+ tmp.m [i] = _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (
1281
+ _mm_shuffle_epi8 (a.m [0 ], _mm_or_si128 (b.m [i], _mm_cmpgt_epi8 (b.m [i], _mm_set1_epi8 (15 )))),
1282
+ _mm_shuffle_epi8 (a.m [1 ], _mm_or_si128 (_mm_sub_epi8 (b.m [i], _mm_set1_epi8 (16 )), _mm_cmpgt_epi8 (b.m [i], _mm_set1_epi8 (31 ))))),
1283
+ _mm_shuffle_epi8 (a.m [2 ], _mm_or_si128 (_mm_sub_epi8 (b.m [i], _mm_set1_epi8 (32 )), _mm_cmpgt_epi8 (b.m [i], _mm_set1_epi8 (47 ))))),
1284
+ _mm_shuffle_epi8 (a.m [3 ], _mm_sub_epi8 (b.m [i], _mm_set1_epi8 (48 ))));
1285
+ return tmp;
1286
+ }
1287
+
1288
+ template <>
1289
+ inline SIMD<int8_t , 64 > vshuf (SIMD<int8_t , 64 > a, SIMD<uint8_t , 64 > b)
1290
+ {
1291
+ SIMD<int8_t , 64 > tmp;
1292
+ for (int i = 0 ; i < 4 ; ++i)
1293
+ tmp.m [i] = _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (
1294
+ _mm_shuffle_epi8 (a.m [0 ], _mm_or_si128 (b.m [i], _mm_cmpgt_epi8 (b.m [i], _mm_set1_epi8 (15 )))),
1295
+ _mm_shuffle_epi8 (a.m [1 ], _mm_or_si128 (_mm_sub_epi8 (b.m [i], _mm_set1_epi8 (16 )), _mm_cmpgt_epi8 (b.m [i], _mm_set1_epi8 (31 ))))),
1296
+ _mm_shuffle_epi8 (a.m [2 ], _mm_or_si128 (_mm_sub_epi8 (b.m [i], _mm_set1_epi8 (32 )), _mm_cmpgt_epi8 (b.m [i], _mm_set1_epi8 (47 ))))),
1297
+ _mm_shuffle_epi8 (a.m [3 ], _mm_sub_epi8 (b.m [i], _mm_set1_epi8 (48 ))));
1298
+ return tmp;
1299
+ }
1300
+
0 commit comments