Skip to content

Commit

Permalink
Most of the SIMD is done. No wasm yet.
Browse files Browse the repository at this point in the history
  • Loading branch information
aous72 committed Nov 5, 2024
1 parent eafb965 commit 5f99c89
Show file tree
Hide file tree
Showing 20 changed files with 1,834 additions and 280 deletions.
6 changes: 2 additions & 4 deletions src/core/codestream/ojph_codeblock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ namespace ojph {
cb_size.w);
}
else
this->codeblock_functions.mem_clear32(dp, cb_size.w * sizeof(ui32));
this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(ui32));
}
else
{
Expand All @@ -259,9 +259,7 @@ namespace ojph {
cb_size.w);
}
else
this->codeblock_functions.mem_clear64(dp, cb_size.w * sizeof(*dp));


this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp));
}

++cur_line;
Expand Down
41 changes: 28 additions & 13 deletions src/core/codestream/ojph_codeblock_fun.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,10 @@ namespace ojph {
{

//////////////////////////////////////////////////////////////////////////
void gen_mem_clear32(si32* addr, size_t count);
void sse_mem_clear32(si32* addr, size_t count);
void avx_mem_clear32(si32* addr, size_t count);
void wasm_mem_clear32(si32* addr, size_t count);

void gen_mem_clear64(si64* addr, size_t count);
void sse_mem_clear64(si64* addr, size_t count);
void avx_mem_clear64(si64* addr, size_t count);
void wasm_mem_clear64(si64* addr, size_t count);
void gen_mem_clear(void* addr, size_t count);
void sse_mem_clear(void* addr, size_t count);
void avx_mem_clear(void* addr, size_t count);
void wasm_mem_clear(void* addr, size_t count);

//////////////////////////////////////////////////////////////////////////
ui32 gen_find_max_val32(ui32* address);
Expand Down Expand Up @@ -135,7 +130,7 @@ namespace ojph {
// Default path, no acceleration. We may change this later
decode_cb32 = ojph_decode_codeblock32;
find_max_val32 = gen_find_max_val32;
mem_clear32 = gen_mem_clear32;
mem_clear = gen_mem_clear;
if (reversible) {
tx_to_cb32 = gen_rev_tx_to_cb32;
tx_from_cb32 = gen_rev_tx_from_cb32;
Expand All @@ -149,7 +144,6 @@ namespace ojph {

decode_cb64 = ojph_decode_codeblock64;
find_max_val64 = gen_find_max_val64;
mem_clear64 = gen_mem_clear64;
if (reversible) {
tx_to_cb64 = gen_rev_tx_to_cb64;
tx_from_cb64 = gen_rev_tx_from_cb64;
Expand All @@ -168,7 +162,7 @@ namespace ojph {
// Accelerated functions for INTEL/AMD CPUs
#ifndef OJPH_DISABLE_SSE
if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE)
mem_clear32 = sse_mem_clear32;
mem_clear = sse_mem_clear;
#endif // !OJPH_DISABLE_SSE

#ifndef OJPH_DISABLE_SSE2
Expand All @@ -182,6 +176,16 @@ namespace ojph {
tx_to_cb32 = sse2_irv_tx_to_cb32;
tx_from_cb32 = sse2_irv_tx_from_cb32;
}
find_max_val64 = sse2_find_max_val64;
if (reversible) {
tx_to_cb64 = sse2_rev_tx_to_cb64;
tx_from_cb64 = sse2_rev_tx_from_cb64;
}
else
{
tx_to_cb64 = NULL;
tx_from_cb64 = NULL;
}
}
#endif // !OJPH_DISABLE_SSE2

Expand All @@ -192,7 +196,7 @@ namespace ojph {

#ifndef OJPH_DISABLE_AVX
if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX)
mem_clear32 = avx_mem_clear32;
mem_clear = avx_mem_clear;
#endif // !OJPH_DISABLE_AVX

#ifndef OJPH_DISABLE_AVX2
Expand All @@ -208,6 +212,17 @@ namespace ojph {
}
encode_cb32 = ojph_encode_codeblock_avx2;
decode_cb32 = ojph_decode_codeblock_avx2;

find_max_val64 = avx2_find_max_val64;
if (reversible) {
tx_to_cb64 = avx2_rev_tx_to_cb64;
tx_from_cb64 = avx2_rev_tx_from_cb64;
}
else
{
tx_to_cb64 = NULL;
tx_from_cb64 = NULL;
}
}
#endif // !OJPH_DISABLE_AVX2

Expand Down
6 changes: 2 additions & 4 deletions src/core/codestream/ojph_codeblock_fun.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ namespace ojph {
namespace local {

// define function signature simple memory clearing
typedef void (*mem_clear_fun32)(si32* addr, size_t count);
typedef void (*mem_clear_fun64)(si64* addr, size_t count);
typedef void (*mem_clear_fun)(void* addr, size_t count);

// define function signature for max value finding
typedef ui32 (*find_max_val_fun32)(ui32* addr);
Expand Down Expand Up @@ -96,8 +95,7 @@ namespace ojph {
void init(bool reversible);

// a pointer to the max value finding function
mem_clear_fun32 mem_clear32;
mem_clear_fun64 mem_clear64;
mem_clear_fun mem_clear;

// a pointer to the max value finding function
find_max_val_fun32 find_max_val32;
Expand Down
2 changes: 1 addition & 1 deletion src/core/codestream/ojph_codestream_avx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ namespace ojph {
namespace local {

//////////////////////////////////////////////////////////////////////////
void avx_mem_clear32(si32* addr, size_t count)
void avx_mem_clear(void* addr, size_t count)
{
float* p = (float*)addr;
__m256 zero = _mm256_setzero_ps();
Expand Down
79 changes: 72 additions & 7 deletions src/core/codestream/ojph_codestream_avx2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,18 @@ namespace ojph {
return t;
}

//////////////////////////////////////////////////////////////////////////
ui64 avx2_find_max_val64(ui64* address)
{
__m128i x0 = _mm_loadu_si128((__m128i*)address);
__m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
x0 = _mm_or_si128(x0, x1);
x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
x0 = _mm_or_si128(x0, x1);
ui64 t = (ui64)_mm_extract_epi64(x0, 0);
return t;
}

//////////////////////////////////////////////////////////////////////////
void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
float delta_inv, ui32 count, ui32* max_val)
Expand All @@ -78,7 +90,7 @@ namespace ojph {
}
_mm256_storeu_si256((__m256i*)max_val, tmax);
}

//////////////////////////////////////////////////////////////////////////
void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
float delta_inv, ui32 count, ui32* max_val)
Expand Down Expand Up @@ -115,11 +127,11 @@ namespace ojph {
si32 *p = (si32*)dp;
for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
{
__m256i v = _mm256_load_si256((__m256i*)sp);
__m256i val = _mm256_and_si256(v, m1);
val = _mm256_srli_epi32(val, (int)shift);
val = _mm256_sign_epi32(val, v);
_mm256_storeu_si256((__m256i*)p, val);
__m256i v = _mm256_load_si256((__m256i*)sp);
__m256i val = _mm256_and_si256(v, m1);
val = _mm256_srli_epi32(val, (int)shift);
val = _mm256_sign_epi32(val, v);
_mm256_storeu_si256((__m256i*)p, val);
}
}

Expand All @@ -142,5 +154,58 @@ namespace ojph {
_mm256_storeu_ps(p, valf);
}
}

//////////////////////////////////////////////////////////////////////////
void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
float delta_inv, ui32 count, ui64* max_val)
{
ojph_unused(delta_inv);

// convert to sign and magnitude and keep max_val
ui32 shift = 63 - K_max;
__m256i m0 = _mm256_set1_epi64x(0x8000000000000000LL);
__m256i zero = _mm256_setzero_si256();
__m256i one = _mm256_set1_epi64x(1);
__m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
__m256i *p = (__m256i*)sp;
for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
{
__m256i v = _mm256_loadu_si256(p);
__m256i sign = _mm256_cmpgt_epi64(zero, v);
__m256i val = _mm256_xor_si256(v, sign); // negate 1's complement
__m256i ones = _mm256_and_si256(sign, one);
val = _mm256_add_epi64(val, ones); // 2's complement
sign = _mm256_and_si256(sign, m0);
val = _mm256_slli_epi64(val, (int)shift);
tmax = _mm256_or_si256(tmax, val);
val = _mm256_or_si256(val, sign);
_mm256_storeu_si256((__m256i*)dp, val);
}
_mm256_storeu_si256((__m256i*)max_val, tmax);
}

//////////////////////////////////////////////////////////////////////////
void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
float delta, ui32 count)
{
ojph_unused(delta);

ui32 shift = 63 - K_max;
__m256i m1 = _mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
__m256i zero = _mm256_setzero_si256();
__m256i one = _mm256_set1_epi64x(1);
si64 *p = (si64*)dp;
for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
{
__m256i v = _mm256_load_si256((__m256i*)sp);
__m256i val = _mm256_and_si256(v, m1);
val = _mm256_srli_epi64(val, (int)shift);
__m256i sign = _mm256_cmpgt_epi64(zero, v);
val = _mm256_xor_si256(val, sign); // negate 1's complement
__m256i ones = _mm256_and_si256(sign, one);
val = _mm256_add_epi64(val, ones); // 2's complement
_mm256_storeu_si256((__m256i*)p, val);
}
}
}
}
}
12 changes: 3 additions & 9 deletions src/core/codestream/ojph_codestream_gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,11 @@ namespace ojph {
namespace local {

//////////////////////////////////////////////////////////////////////////
void gen_mem_clear32(si32* addr, size_t count)
{
for (size_t i = 0; i < count; i += 4)
*addr++ = 0;
}

//////////////////////////////////////////////////////////////////////////
void gen_mem_clear64(si64* addr, size_t count)
void gen_mem_clear(void* addr, size_t count)
{
si64* p = (si64*)addr;
for (size_t i = 0; i < count; i += 8)
*addr++ = 0;
*p++ = 0;
}

//////////////////////////////////////////////////////////////////////////
Expand Down
3 changes: 1 addition & 2 deletions src/core/codestream/ojph_codestream_sse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,12 @@ namespace ojph {
namespace local {

//////////////////////////////////////////////////////////////////////////
void sse_mem_clear32(si32* addr, size_t count)
void sse_mem_clear(void* addr, size_t count)
{
float* p = (float*)addr;
__m128 zero = _mm_setzero_ps();
for (size_t i = 0; i < count; i += 16, p += 4)
_mm_storeu_ps(p, zero);
}

}
}
85 changes: 77 additions & 8 deletions src/core/codestream/ojph_codestream_sse2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,21 @@ namespace ojph {
// return t;
}

//////////////////////////////////////////////////////////////////////////
ui64 sse2_find_max_val64(ui64* address)
{
__m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
x0 = _mm_or_si128(x0, x1);
_mm_storeu_si128((__m128i*)address, x0);
return *address;
// A single movd t, xmm0 can do the trick, but it is not available
// in SSE2 intrinsics. extract_epi32 is available in sse4.1
// ui32 t = (ui32)_mm_extract_epi16(x0, 0);
// t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
// return t;
}

//////////////////////////////////////////////////////////////////////////
void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
float delta_inv, ui32 count, ui32* max_val)
Expand Down Expand Up @@ -129,14 +144,14 @@ namespace ojph {
si32 *p = (si32*)dp;
for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
{
__m128i v = _mm_load_si128((__m128i*)sp);
__m128i val = _mm_and_si128(v, m1);
val = _mm_srli_epi32(val, (int)shift);
__m128i sign = _mm_cmplt_epi32(v, zero);
val = _mm_xor_si128(val, sign); // negate 1's complement
__m128i ones = _mm_and_si128(sign, one);
val = _mm_add_epi32(val, ones); // 2's complement
_mm_storeu_si128((__m128i*)p, val);
__m128i v = _mm_load_si128((__m128i*)sp);
__m128i val = _mm_and_si128(v, m1);
val = _mm_srli_epi32(val, (int)shift);
__m128i sign = _mm_cmplt_epi32(v, zero);
val = _mm_xor_si128(val, sign); // negate 1's complement
__m128i ones = _mm_and_si128(sign, one);
val = _mm_add_epi32(val, ones); // 2's complement
_mm_storeu_si128((__m128i*)p, val);
}
}

Expand All @@ -159,5 +174,59 @@ namespace ojph {
_mm_storeu_ps(p, valf);
}
}

//////////////////////////////////////////////////////////////////////////
void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
float delta_inv, ui32 count, ui64* max_val)
{
ojph_unused(delta_inv);

// convert to sign and magnitude and keep max_val
ui32 shift = 63 - K_max;
__m128i m0 = _mm_set1_epi64x(0x8000000000000000LL);
__m128i zero = _mm_setzero_si128();
__m128i one = _mm_set1_epi64x(1);
__m128i tmax = _mm_loadu_si128((__m128i*)max_val);
__m128i *p = (__m128i*)sp;
for (ui32 i = 0; i < count; i += 2, p += 1, dp += 2)
{
__m128i v = _mm_loadu_si128(p);
__m128i sign = _mm_cmplt_epi32(v, zero);
sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
__m128i val = _mm_xor_si128(v, sign); // negate 1's complement
__m128i ones = _mm_and_si128(sign, one);
val = _mm_add_epi64(val, ones); // 2's complement
sign = _mm_and_si128(sign, m0);
val = _mm_slli_epi64(val, (int)shift);
tmax = _mm_or_si128(tmax, val);
val = _mm_or_si128(val, sign);
_mm_storeu_si128((__m128i*)dp, val);
}
_mm_storeu_si128((__m128i*)max_val, tmax);
}

//////////////////////////////////////////////////////////////////////////
void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
float delta, ui32 count)
{
ojph_unused(delta);
ui32 shift = 63 - K_max;
__m128i m1 = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
__m128i zero = _mm_setzero_si128();
__m128i one = _mm_set1_epi64x(1);
si64 *p = (si64*)dp;
for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
{
__m128i v = _mm_load_si128((__m128i*)sp);
__m128i val = _mm_and_si128(v, m1);
val = _mm_srli_epi64(val, (int)shift);
__m128i sign = _mm_cmplt_epi32(v, zero);
sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
val = _mm_xor_si128(val, sign); // negate 1's complement
__m128i ones = _mm_and_si128(sign, one);
val = _mm_add_epi64(val, ones); // 2's complement
_mm_storeu_si128((__m128i*)p, val);
}
}
}
}
Loading

0 comments on commit 5f99c89

Please sign in to comment.