Skip to content

Commit

Permalink
Forgot to commit these as well.
Browse files Browse the repository at this point in the history
  • Loading branch information
aous72 committed May 25, 2022
1 parent 2ba5389 commit 02f29d3
Show file tree
Hide file tree
Showing 7 changed files with 3 additions and 245 deletions.
6 changes: 0 additions & 6 deletions src/apps/common/ojph_img_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,15 +346,9 @@ namespace ojph {
void avx2_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1,
const line_buf *ln2, void *dp,
int bit_depth, int count);
void avx2_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1,
const line_buf *ln2, void *dp,
int bit_depth, int count);
void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
const line_buf *ln2, void *dp,
int bit_depth, int count);
void avx2_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1,
const line_buf *ln2, void *dp,
int bit_depth, int count);

////////////////////////////////////////////////////////////////////////////
//
Expand Down
2 changes: 1 addition & 1 deletion src/apps/others/ojph_img_io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ namespace ojph {
if (num_components == 1)
converter = avx2_cvrt_32b1c_to_16ub1c_be;
else
converter = avx2_cvrt_32b3c_to_16ub3c_be;
{ } // did not find an implementation better than sse41
}
}

Expand Down
236 changes: 0 additions & 236 deletions src/apps/others/ojph_img_io_avx2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,124 +275,6 @@ namespace ojph {
}
}

/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
// Interleaves three planar 32-bit signed component lines (ln0, ln1, ln2)
// into one packed buffer of 16-bit unsigned samples in *native*
// little-endian byte order, i.e. p[0]=c0, p[1]=c1, p[2]=c2, p[3]=c0, ...
// Each sample is clamped to [0, 2^bit_depth - 1] before narrowing.
//
// ln0/ln1/ln2 : source component lines (32-bit samples; loads assume the
//               line buffers are 32-byte aligned — aligned AVX2 loads)
// dp          : destination buffer of interleaved ui16 samples
// bit_depth   : sample precision; defines the clamp ceiling
// count       : number of pixels (per component) to convert
void avx2_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1,
                                  const line_buf *ln2, void *dp,
                                  int bit_depth, int count)
{
  const si32 *sp0 = ln0->i32;
  const si32 *sp1 = ln1->i32;
  const si32 *sp2 = ln2->i32;
  ui16* p = (ui16*)dp;

  __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
  __m256i zero = _mm256_setzero_si256();

  // Byte-shuffle masks (0xFF lanes produce zero) used to interleave the
  // packed 16-bit samples of three registers into c0,c1,c2 pixel order;
  // the same pattern is applied to both 128-bit halves of each register.
  __m256i m0 = _mm256_set_epi64x(0x0B0A0908FFFF0706, 0x0504FFFF03020100,
                                 0x0B0A0908FFFF0706, 0x0504FFFF03020100);
  __m256i m1 = _mm256_set_epi64x(0xFFFFFFFF0504FFFF, 0xFFFF0100FFFFFFFF,
                                 0xFFFFFFFF0504FFFF, 0xFFFF0100FFFFFFFF);
  __m256i m2 = _mm256_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF0F0E0D0CFFFF,
                                 0xFFFFFFFFFFFFFFFF, 0xFFFF0F0E0D0CFFFF);
  __m256i m3 = _mm256_set_epi64x(0x0706FFFFFFFF0302, 0x0D0CFFFFFFFF0908,
                                 0x0706FFFFFFFF0302, 0x0D0CFFFFFFFF0908);
  __m256i m4 = _mm256_set_epi64x(0xFFFF03020100FFFF, 0xFFFFFFFFFFFFFFFF,
                                 0xFFFF03020100FFFF, 0xFFFFFFFFFFFFFFFF);
  __m256i m5 = _mm256_set_epi64x(0xFFFFFFFF0F0EFFFF, 0xFFFF0B0AFFFFFFFF,
                                 0xFFFFFFFF0F0EFFFF, 0xFFFF0B0AFFFFFFFF);
  __m256i m6 = _mm256_set_epi64x(0x0F0E0D0CFFFF0B0A, 0x0908FFFF07060504,
                                 0x0F0E0D0CFFFF0B0A, 0x0908FFFF07060504);

  // 16 pixels (48 ui16 outputs, three 256-bit stores) per iteration
  for ( ; count >= 16; count -= 16, sp0 += 16, sp1 += 16, sp2 += 16, p += 48)
  {
    __m256i a, b, t, u, v;

    // Clamp each 32-bit sample to [0, max] and pack pairs of components
    // into the low/high 16-bit halves of each 32-bit lane:
    //   t = c1|c0 (pixels 0..7), u = c0'|c2, v = c2'|c1'
    a = _mm256_load_si256((__m256i*)sp0);
    a = _mm256_max_epi32(a, zero);
    t = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    t = _mm256_or_si256(t, a);

    a = _mm256_load_si256((__m256i*)sp2);
    a = _mm256_max_epi32(a, zero);
    u = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp0 + 1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    u = _mm256_or_si256(u, a);

    a = _mm256_load_si256((__m256i*)sp1 + 1);
    a = _mm256_max_epi32(a, zero);
    v = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp2 + 1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    v = _mm256_or_si256(v, a);

    // Interleave into pixel order using the sse41 method, applied to
    // each 128-bit half independently.
    __m256i xt, xu, xv;

    a = _mm256_shuffle_epi8(t, m0);
    b = _mm256_shuffle_epi8(u, m1);
    xt = _mm256_or_si256(a, b);

    a = _mm256_shuffle_epi8(t, m2);
    b = _mm256_shuffle_epi8(u, m3);
    a = _mm256_or_si256(a, b);
    b = _mm256_shuffle_epi8(v, m4);
    xu = _mm256_or_si256(a, b);

    a = _mm256_shuffle_epi8(u, m5);
    b = _mm256_shuffle_epi8(v, m6);
    xv = _mm256_or_si256(a, b);

    // The per-half shuffles leave the 64-bit chunks out of sequence;
    // gather them into the correct streaming order before storing.
    t = _mm256_set_epi64x(_mm256_extract_epi64(xt, 2),
                          _mm256_extract_epi64(xu, 0),
                          _mm256_extract_epi64(xt, 1),
                          _mm256_extract_epi64(xt, 0));
    _mm256_storeu_si256((__m256i*)p    , t);

    t = _mm256_set_epi64x(_mm256_extract_epi64(xv, 0),
                          _mm256_extract_epi64(xu, 1),
                          _mm256_extract_epi64(xu, 2),
                          _mm256_extract_epi64(xt, 3));
    _mm256_storeu_si256((__m256i*)p + 1, t);

    t = _mm256_set_epi64x(_mm256_extract_epi64(xv, 3),
                          _mm256_extract_epi64(xv, 2),
                          _mm256_extract_epi64(xu, 3),
                          _mm256_extract_epi64(xv, 1));
    _mm256_storeu_si256((__m256i*)p + 2, t);
  }

  // Scalar tail for the remaining (< 16) pixels.  The SIMD loop above
  // stores samples in native (little-endian) order, so the tail must not
  // byte-swap either; the previous code called be2le() on the first two
  // components only, which byte-swapped them inconsistently with both the
  // third component and the vector path.
  int max_val = (1<<bit_depth) - 1;
  for ( ; count > 0; --count)
  {
    int val;
    val = *sp0++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = (ui16) val;
    val = *sp1++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = (ui16) val;
    val = *sp2++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = (ui16) val;
  }
}

/////////////////////////////////////////////////////////////////////////////
void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
const line_buf *ln2, void *dp,
Expand Down Expand Up @@ -436,122 +318,4 @@ namespace ojph {
*p++ = be2le((ui16) val);
}
}

/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
// Interleaves three planar 32-bit signed component lines (ln0, ln1, ln2)
// into one packed buffer of 16-bit unsigned samples in big-endian byte
// order: p[0]=c0, p[1]=c1, p[2]=c2, p[3]=c0, ...
// Each sample is clamped to [0, 2^bit_depth - 1] before narrowing.
//
// ln0/ln1/ln2 : source component lines (32-bit samples; loads assume the
//               line buffers are 32-byte aligned — aligned AVX2 loads)
// dp          : destination buffer of interleaved big-endian ui16 samples
// bit_depth   : sample precision; defines the clamp ceiling
// count       : number of pixels (per component) to convert
void avx2_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1,
                                  const line_buf *ln2, void *dp,
                                  int bit_depth, int count)
{
  const si32 *sp0 = ln0->i32;
  const si32 *sp1 = ln1->i32;
  const si32 *sp2 = ln2->i32;
  ui16* p = (ui16*)dp;

  __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
  __m256i zero = _mm256_setzero_si256();

  // Byte-shuffle masks (0xFF lanes produce zero) that interleave the
  // packed 16-bit samples of three registers into c0,c1,c2 pixel order.
  // Compared to the _le variant, each byte pair in the indices is swapped
  // (e.g. 0x0A0B vs 0x0B0A), so the shuffle performs the big-endian byte
  // swap at the same time as the interleave.
  __m256i m0 = _mm256_set_epi64x(0x0A0B0809FFFF0607, 0x0405FFFF02030001,
                                 0x0A0B0809FFFF0607, 0x0405FFFF02030001);
  __m256i m1 = _mm256_set_epi64x(0xFFFFFFFF0405FFFF, 0xFFFF0001FFFFFFFF,
                                 0xFFFFFFFF0405FFFF, 0xFFFF0001FFFFFFFF);
  __m256i m2 = _mm256_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF0E0F0C0DFFFF,
                                 0xFFFFFFFFFFFFFFFF, 0xFFFF0E0F0C0DFFFF);
  __m256i m3 = _mm256_set_epi64x(0x0607FFFFFFFF0203, 0x0C0DFFFFFFFF0809,
                                 0x0607FFFFFFFF0203, 0x0C0DFFFFFFFF0809);
  __m256i m4 = _mm256_set_epi64x(0xFFFF02030001FFFF, 0xFFFFFFFFFFFFFFFF,
                                 0xFFFF02030001FFFF, 0xFFFFFFFFFFFFFFFF);
  __m256i m5 = _mm256_set_epi64x(0xFFFFFFFF0E0FFFFF, 0xFFFF0A0BFFFFFFFF,
                                 0xFFFFFFFF0E0FFFFF, 0xFFFF0A0BFFFFFFFF);
  __m256i m6 = _mm256_set_epi64x(0x0E0F0C0DFFFF0A0B, 0x0809FFFF06070405,
                                 0x0E0F0C0DFFFF0A0B, 0x0809FFFF06070405);

  // 16 pixels (48 ui16 outputs, three 256-bit stores) per iteration
  for ( ; count >= 16; count -= 16, sp0 += 16, sp1 += 16, sp2 += 16, p += 48)
  {
    __m256i a, b, t, u, v;

    // Clamp each 32-bit sample to [0, max] and pack pairs of components
    // into the low/high 16-bit halves of each 32-bit lane.
    a = _mm256_load_si256((__m256i*)sp0);
    a = _mm256_max_epi32(a, zero);
    t = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    t = _mm256_or_si256(t, a);

    a = _mm256_load_si256((__m256i*)sp2);
    a = _mm256_max_epi32(a, zero);
    u = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp0 + 1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    u = _mm256_or_si256(u, a);

    a = _mm256_load_si256((__m256i*)sp1 + 1);
    a = _mm256_max_epi32(a, zero);
    v = _mm256_min_epi32(a, max_val_vec);

    a = _mm256_load_si256((__m256i*)sp2 + 1);
    a = _mm256_max_epi32(a, zero);
    a = _mm256_min_epi32(a, max_val_vec);
    a = _mm256_slli_epi32(a, 16);
    v = _mm256_or_si256(v, a);

    // start combining using the sse41 method (applied per 128-bit half)
    __m256i xt, xu, xv;

    a = _mm256_shuffle_epi8(t, m0);
    b = _mm256_shuffle_epi8(u, m1);
    xt = _mm256_or_si256(a, b);

    a = _mm256_shuffle_epi8(t, m2);
    b = _mm256_shuffle_epi8(u, m3);
    a = _mm256_or_si256(a, b);
    b = _mm256_shuffle_epi8(v, m4);
    xu = _mm256_or_si256(a, b);

    a = _mm256_shuffle_epi8(u, m5);
    b = _mm256_shuffle_epi8(v, m6);
    xv = _mm256_or_si256(a, b);

    // The per-half shuffles leave the 64-bit chunks out of sequence;
    // gather them into the correct streaming order before storing.
    t = _mm256_set_epi64x(_mm256_extract_epi64(xt, 2),
                          _mm256_extract_epi64(xu, 0),
                          _mm256_extract_epi64(xt, 1),
                          _mm256_extract_epi64(xt, 0));
    _mm256_storeu_si256((__m256i*)p    , t);

    t = _mm256_set_epi64x(_mm256_extract_epi64(xv, 0),
                          _mm256_extract_epi64(xu, 1),
                          _mm256_extract_epi64(xu, 2),
                          _mm256_extract_epi64(xt, 3));
    _mm256_storeu_si256((__m256i*)p + 1, t);

    t = _mm256_set_epi64x(_mm256_extract_epi64(xv, 3),
                          _mm256_extract_epi64(xv, 2),
                          _mm256_extract_epi64(xu, 3),
                          _mm256_extract_epi64(xv, 1));
    _mm256_storeu_si256((__m256i*)p + 2, t);
  }

  // Scalar tail for the remaining (< 16) pixels; be2le() performs the
  // same byte swap the shuffle masks bake into the SIMD path.
  int max_val = (1<<bit_depth) - 1;
  for ( ; count > 0; --count)
  {
    int val;
    val = *sp0++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = be2le((ui16) val);
    val = *sp1++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = be2le((ui16) val);
    val = *sp2++;
    val = val >= 0 ? val : 0;
    val = val <= max_val ? val : max_val;
    *p++ = be2le((ui16) val);
  }
}
}
Loading

0 comments on commit 02f29d3

Please sign in to comment.