Skip to content

Commit 5721da8

Browse files
author
Ahmet Inan
committed
added vshuf versions for SSE
1 parent cd320b4 commit 5721da8

File tree

3 files changed

+72
-0
lines changed

3 files changed

+72
-0
lines changed

sse4_1_double.hh

+22
Original file line numberDiff line numberDiff line change
@@ -1272,3 +1272,25 @@ inline SIMD<int32_t, 8> vclamp(SIMD<int32_t, 8> x, int32_t a, int32_t b)
12721272
return tmp;
12731273
}
12741274

1275+
template <>
1276+
inline SIMD<uint8_t, 32> vshuf(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
1277+
{
1278+
SIMD<uint8_t, 32> tmp;
1279+
for (int i = 0; i < 2; ++i)
1280+
tmp.m[i] = _mm_or_si128(
1281+
_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
1282+
_mm_shuffle_epi8(a.m[1], _mm_sub_epi8(b.m[i], _mm_set1_epi8(16))));
1283+
return tmp;
1284+
}
1285+
1286+
template <>
1287+
inline SIMD<int8_t, 32> vshuf(SIMD<int8_t, 32> a, SIMD<uint8_t, 32> b)
1288+
{
1289+
SIMD<int8_t, 32> tmp;
1290+
for (int i = 0; i < 2; ++i)
1291+
tmp.m[i] = _mm_or_si128(
1292+
_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
1293+
_mm_shuffle_epi8(a.m[1], _mm_sub_epi8(b.m[i], _mm_set1_epi8(16))));
1294+
return tmp;
1295+
}
1296+

sse4_1_quadruple.hh

+26
Original file line numberDiff line numberDiff line change
@@ -1272,3 +1272,29 @@ inline SIMD<int32_t, 16> vclamp(SIMD<int32_t, 16> x, int32_t a, int32_t b)
12721272
return tmp;
12731273
}
12741274

1275+
template <>
1276+
inline SIMD<uint8_t, 64> vshuf(SIMD<uint8_t, 64> a, SIMD<uint8_t, 64> b)
1277+
{
1278+
SIMD<uint8_t, 64> tmp;
1279+
for (int i = 0; i < 4; ++i)
1280+
tmp.m[i] = _mm_or_si128(_mm_or_si128(_mm_or_si128(
1281+
_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
1282+
_mm_shuffle_epi8(a.m[1], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(16)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(31))))),
1283+
_mm_shuffle_epi8(a.m[2], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(32)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(47))))),
1284+
_mm_shuffle_epi8(a.m[3], _mm_sub_epi8(b.m[i], _mm_set1_epi8(48))));
1285+
return tmp;
1286+
}
1287+
1288+
template <>
1289+
inline SIMD<int8_t, 64> vshuf(SIMD<int8_t, 64> a, SIMD<uint8_t, 64> b)
1290+
{
1291+
SIMD<int8_t, 64> tmp;
1292+
for (int i = 0; i < 4; ++i)
1293+
tmp.m[i] = _mm_or_si128(_mm_or_si128(_mm_or_si128(
1294+
_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
1295+
_mm_shuffle_epi8(a.m[1], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(16)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(31))))),
1296+
_mm_shuffle_epi8(a.m[2], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(32)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(47))))),
1297+
_mm_shuffle_epi8(a.m[3], _mm_sub_epi8(b.m[i], _mm_set1_epi8(48))));
1298+
return tmp;
1299+
}
1300+

sse4_1_triple.hh

+24
Original file line numberDiff line numberDiff line change
@@ -1272,3 +1272,27 @@ inline SIMD<int32_t, 12> vclamp(SIMD<int32_t, 12> x, int32_t a, int32_t b)
12721272
return tmp;
12731273
}
12741274

1275+
template <>
1276+
inline SIMD<uint8_t, 48> vshuf(SIMD<uint8_t, 48> a, SIMD<uint8_t, 48> b)
1277+
{
1278+
SIMD<uint8_t, 48> tmp;
1279+
for (int i = 0; i < 3; ++i)
1280+
tmp.m[i] = _mm_or_si128(_mm_or_si128(
1281+
_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
1282+
_mm_shuffle_epi8(a.m[1], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(16)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(31))))),
1283+
_mm_shuffle_epi8(a.m[2], _mm_sub_epi8(b.m[i], _mm_set1_epi8(32))));
1284+
return tmp;
1285+
}
1286+
1287+
template <>
1288+
inline SIMD<int8_t, 48> vshuf(SIMD<int8_t, 48> a, SIMD<uint8_t, 48> b)
1289+
{
1290+
SIMD<int8_t, 48> tmp;
1291+
for (int i = 0; i < 3; ++i)
1292+
tmp.m[i] = _mm_or_si128(_mm_or_si128(
1293+
_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
1294+
_mm_shuffle_epi8(a.m[1], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(16)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(31))))),
1295+
_mm_shuffle_epi8(a.m[2], _mm_sub_epi8(b.m[i], _mm_set1_epi8(32))));
1296+
return tmp;
1297+
}
1298+

0 commit comments

Comments
 (0)