Skip to content

Commit 02f29d3

Browse files
committed
Forgot to commit these as well.
1 parent 2ba5389 commit 02f29d3

File tree

7 files changed

+3
-245
lines changed

7 files changed

+3
-245
lines changed

src/apps/common/ojph_img_io.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -346,15 +346,9 @@ namespace ojph {
346346
void avx2_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1,
347347
const line_buf *ln2, void *dp,
348348
int bit_depth, int count);
349-
void avx2_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1,
350-
const line_buf *ln2, void *dp,
351-
int bit_depth, int count);
352349
void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
353350
const line_buf *ln2, void *dp,
354351
int bit_depth, int count);
355-
void avx2_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1,
356-
const line_buf *ln2, void *dp,
357-
int bit_depth, int count);
358352

359353
////////////////////////////////////////////////////////////////////////////
360354
//

src/apps/others/ojph_img_io.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ namespace ojph {
489489
if (num_components == 1)
490490
converter = avx2_cvrt_32b1c_to_16ub1c_be;
491491
else
492-
converter = avx2_cvrt_32b3c_to_16ub3c_be;
492+
{ } // did not find an implementation better than sse41
493493
}
494494
}
495495

src/apps/others/ojph_img_io_avx2.cpp

Lines changed: 0 additions & 236 deletions
Original file line numberDiff line numberDiff line change
@@ -275,124 +275,6 @@ namespace ojph {
275275
}
276276
}
277277

278-
/////////////////////////////////////////////////////////////////////////////
279-
void avx2_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1,
280-
const line_buf *ln2, void *dp,
281-
int bit_depth, int count)
282-
{
283-
const si32 *sp0 = ln0->i32;
284-
const si32 *sp1 = ln1->i32;
285-
const si32 *sp2 = ln2->i32;
286-
ui16* p = (ui16*)dp;
287-
288-
__m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
289-
__m256i zero = _mm256_setzero_si256();
290-
291-
__m256i m0 = _mm256_set_epi64x(0x0B0A0908FFFF0706, 0x0504FFFF03020100,
292-
0x0B0A0908FFFF0706, 0x0504FFFF03020100);
293-
__m256i m1 = _mm256_set_epi64x(0xFFFFFFFF0504FFFF, 0xFFFF0100FFFFFFFF,
294-
0xFFFFFFFF0504FFFF, 0xFFFF0100FFFFFFFF);
295-
__m256i m2 = _mm256_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF0F0E0D0CFFFF,
296-
0xFFFFFFFFFFFFFFFF, 0xFFFF0F0E0D0CFFFF);
297-
__m256i m3 = _mm256_set_epi64x(0x0706FFFFFFFF0302, 0x0D0CFFFFFFFF0908,
298-
0x0706FFFFFFFF0302, 0x0D0CFFFFFFFF0908);
299-
__m256i m4 = _mm256_set_epi64x(0xFFFF03020100FFFF, 0xFFFFFFFFFFFFFFFF,
300-
0xFFFF03020100FFFF, 0xFFFFFFFFFFFFFFFF);
301-
__m256i m5 = _mm256_set_epi64x(0xFFFFFFFF0F0EFFFF, 0xFFFF0B0AFFFFFFFF,
302-
0xFFFFFFFF0F0EFFFF, 0xFFFF0B0AFFFFFFFF);
303-
__m256i m6 = _mm256_set_epi64x(0x0F0E0D0CFFFF0B0A, 0x0908FFFF07060504,
304-
0x0F0E0D0CFFFF0B0A, 0x0908FFFF07060504);
305-
306-
// 24 entries in each loop
307-
for ( ; count >= 16; count -= 16, sp0 += 16, sp1 += 16, sp2 += 16, p += 48)
308-
{
309-
__m256i a, b, t, u, v;
310-
a = _mm256_load_si256((__m256i*)sp0);
311-
a = _mm256_max_epi32(a, zero);
312-
t = _mm256_min_epi32(a, max_val_vec);
313-
314-
a = _mm256_load_si256((__m256i*)sp1);
315-
a = _mm256_max_epi32(a, zero);
316-
a = _mm256_min_epi32(a, max_val_vec);
317-
a = _mm256_slli_epi32(a, 16);
318-
t = _mm256_or_si256(t, a);
319-
320-
a = _mm256_load_si256((__m256i*)sp2);
321-
a = _mm256_max_epi32(a, zero);
322-
u = _mm256_min_epi32(a, max_val_vec);
323-
324-
a = _mm256_load_si256((__m256i*)sp0 + 1);
325-
a = _mm256_max_epi32(a, zero);
326-
a = _mm256_min_epi32(a, max_val_vec);
327-
a = _mm256_slli_epi32(a, 16);
328-
u = _mm256_or_si256(u, a);
329-
330-
a = _mm256_load_si256((__m256i*)sp1 + 1);
331-
a = _mm256_max_epi32(a, zero);
332-
v = _mm256_min_epi32(a, max_val_vec);
333-
334-
a = _mm256_load_si256((__m256i*)sp2 + 1);
335-
a = _mm256_max_epi32(a, zero);
336-
a = _mm256_min_epi32(a, max_val_vec);
337-
a = _mm256_slli_epi32(a, 16);
338-
v = _mm256_or_si256(v, a);
339-
340-
// start combining using the sse41 method
341-
__m256i xt, xu, xv;
342-
343-
a = _mm256_shuffle_epi8(t, m0);
344-
b = _mm256_shuffle_epi8(u, m1);
345-
xt = _mm256_or_si256(a, b);
346-
347-
a = _mm256_shuffle_epi8(t, m2);
348-
b = _mm256_shuffle_epi8(u, m3);
349-
a = _mm256_or_si256(a, b);
350-
b = _mm256_shuffle_epi8(v, m4);
351-
xu = _mm256_or_si256(a, b);
352-
353-
a = _mm256_shuffle_epi8(u, m5);
354-
b = _mm256_shuffle_epi8(v, m6);
355-
xv = _mm256_or_si256(a, b);
356-
357-
// reorder them in the correct order
358-
t = _mm256_set_epi64x(_mm256_extract_epi64(xt, 2),
359-
_mm256_extract_epi64(xu, 0),
360-
_mm256_extract_epi64(xt, 1),
361-
_mm256_extract_epi64(xt, 0));
362-
_mm256_storeu_si256((__m256i*)p , t);
363-
364-
t = _mm256_set_epi64x(_mm256_extract_epi64(xv, 0),
365-
_mm256_extract_epi64(xu, 1),
366-
_mm256_extract_epi64(xu, 2),
367-
_mm256_extract_epi64(xt, 3));
368-
_mm256_storeu_si256((__m256i*)p + 1, t);
369-
370-
t = _mm256_set_epi64x(_mm256_extract_epi64(xv, 3),
371-
_mm256_extract_epi64(xv, 2),
372-
_mm256_extract_epi64(xu, 3),
373-
_mm256_extract_epi64(xv, 1));
374-
_mm256_storeu_si256((__m256i*)p + 2, t);
375-
}
376-
377-
int max_val = (1<<bit_depth) - 1;
378-
for ( ; count > 0; --count)
379-
{
380-
int val;
381-
val = *sp0++;
382-
val = val >= 0 ? val : 0;
383-
val = val <= max_val ? val : max_val;
384-
*p++ = be2le((ui16) val);
385-
val = *sp1++;
386-
val = val >= 0 ? val : 0;
387-
val = val <= max_val ? val : max_val;
388-
*p++ = be2le((ui16) val);
389-
val = *sp2++;
390-
val = val >= 0 ? val : 0;
391-
val = val <= max_val ? val : max_val;
392-
*p++ = (ui16) val;
393-
}
394-
}
395-
396278
/////////////////////////////////////////////////////////////////////////////
397279
void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
398280
const line_buf *ln2, void *dp,
@@ -436,122 +318,4 @@ namespace ojph {
436318
*p++ = be2le((ui16) val);
437319
}
438320
}
439-
440-
/////////////////////////////////////////////////////////////////////////////
441-
void avx2_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1,
442-
const line_buf *ln2, void *dp,
443-
int bit_depth, int count)
444-
{
445-
const si32 *sp0 = ln0->i32;
446-
const si32 *sp1 = ln1->i32;
447-
const si32 *sp2 = ln2->i32;
448-
ui16* p = (ui16*)dp;
449-
450-
__m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
451-
__m256i zero = _mm256_setzero_si256();
452-
453-
__m256i m0 = _mm256_set_epi64x(0x0A0B0809FFFF0607, 0x0405FFFF02030001,
454-
0x0A0B0809FFFF0607, 0x0405FFFF02030001);
455-
__m256i m1 = _mm256_set_epi64x(0xFFFFFFFF0405FFFF, 0xFFFF0001FFFFFFFF,
456-
0xFFFFFFFF0405FFFF, 0xFFFF0001FFFFFFFF);
457-
__m256i m2 = _mm256_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF0E0F0C0DFFFF,
458-
0xFFFFFFFFFFFFFFFF, 0xFFFF0E0F0C0DFFFF);
459-
__m256i m3 = _mm256_set_epi64x(0x0607FFFFFFFF0203, 0x0C0DFFFFFFFF0809,
460-
0x0607FFFFFFFF0203, 0x0C0DFFFFFFFF0809);
461-
__m256i m4 = _mm256_set_epi64x(0xFFFF02030001FFFF, 0xFFFFFFFFFFFFFFFF,
462-
0xFFFF02030001FFFF, 0xFFFFFFFFFFFFFFFF);
463-
__m256i m5 = _mm256_set_epi64x(0xFFFFFFFF0E0FFFFF, 0xFFFF0A0BFFFFFFFF,
464-
0xFFFFFFFF0E0FFFFF, 0xFFFF0A0BFFFFFFFF);
465-
__m256i m6 = _mm256_set_epi64x(0x0E0F0C0DFFFF0A0B, 0x0809FFFF06070405,
466-
0x0E0F0C0DFFFF0A0B, 0x0809FFFF06070405);
467-
468-
// 24 entries in each loop
469-
for ( ; count >= 16; count -= 16, sp0 += 16, sp1 += 16, sp2 += 16, p += 48)
470-
{
471-
__m256i a, b, t, u, v;
472-
a = _mm256_load_si256((__m256i*)sp0);
473-
a = _mm256_max_epi32(a, zero);
474-
t = _mm256_min_epi32(a, max_val_vec);
475-
476-
a = _mm256_load_si256((__m256i*)sp1);
477-
a = _mm256_max_epi32(a, zero);
478-
a = _mm256_min_epi32(a, max_val_vec);
479-
a = _mm256_slli_epi32(a, 16);
480-
t = _mm256_or_si256(t, a);
481-
482-
a = _mm256_load_si256((__m256i*)sp2);
483-
a = _mm256_max_epi32(a, zero);
484-
u = _mm256_min_epi32(a, max_val_vec);
485-
486-
a = _mm256_load_si256((__m256i*)sp0 + 1);
487-
a = _mm256_max_epi32(a, zero);
488-
a = _mm256_min_epi32(a, max_val_vec);
489-
a = _mm256_slli_epi32(a, 16);
490-
u = _mm256_or_si256(u, a);
491-
492-
a = _mm256_load_si256((__m256i*)sp1 + 1);
493-
a = _mm256_max_epi32(a, zero);
494-
v = _mm256_min_epi32(a, max_val_vec);
495-
496-
a = _mm256_load_si256((__m256i*)sp2 + 1);
497-
a = _mm256_max_epi32(a, zero);
498-
a = _mm256_min_epi32(a, max_val_vec);
499-
a = _mm256_slli_epi32(a, 16);
500-
v = _mm256_or_si256(v, a);
501-
502-
// start combining using the sse41 method
503-
__m256i xt, xu, xv;
504-
505-
a = _mm256_shuffle_epi8(t, m0);
506-
b = _mm256_shuffle_epi8(u, m1);
507-
xt = _mm256_or_si256(a, b);
508-
509-
a = _mm256_shuffle_epi8(t, m2);
510-
b = _mm256_shuffle_epi8(u, m3);
511-
a = _mm256_or_si256(a, b);
512-
b = _mm256_shuffle_epi8(v, m4);
513-
xu = _mm256_or_si256(a, b);
514-
515-
a = _mm256_shuffle_epi8(u, m5);
516-
b = _mm256_shuffle_epi8(v, m6);
517-
xv = _mm256_or_si256(a, b);
518-
519-
// reorder them in the correct order
520-
t = _mm256_set_epi64x(_mm256_extract_epi64(xt, 2),
521-
_mm256_extract_epi64(xu, 0),
522-
_mm256_extract_epi64(xt, 1),
523-
_mm256_extract_epi64(xt, 0));
524-
_mm256_storeu_si256((__m256i*)p , t);
525-
526-
t = _mm256_set_epi64x(_mm256_extract_epi64(xv, 0),
527-
_mm256_extract_epi64(xu, 1),
528-
_mm256_extract_epi64(xu, 2),
529-
_mm256_extract_epi64(xt, 3));
530-
_mm256_storeu_si256((__m256i*)p + 1, t);
531-
532-
t = _mm256_set_epi64x(_mm256_extract_epi64(xv, 3),
533-
_mm256_extract_epi64(xv, 2),
534-
_mm256_extract_epi64(xu, 3),
535-
_mm256_extract_epi64(xv, 1));
536-
_mm256_storeu_si256((__m256i*)p + 2, t);
537-
}
538-
539-
int max_val = (1<<bit_depth) - 1;
540-
for ( ; count > 0; --count)
541-
{
542-
int val;
543-
val = *sp0++;
544-
val = val >= 0 ? val : 0;
545-
val = val <= max_val ? val : max_val;
546-
*p++ = be2le((ui16) val);
547-
val = *sp1++;
548-
val = val >= 0 ? val : 0;
549-
val = val <= max_val ? val : max_val;
550-
*p++ = be2le((ui16) val);
551-
val = *sp2++;
552-
val = val >= 0 ? val : 0;
553-
val = val <= max_val ? val : max_val;
554-
*p++ = be2le((ui16) val);
555-
}
556-
}
557321
}

0 commit comments

Comments
 (0)