@@ -275,124 +275,6 @@ namespace ojph {
275
275
}
276
276
}
277
277
278
- // ///////////////////////////////////////////////////////////////////////////
279
- void avx2_cvrt_32b3c_to_16ub3c_le (const line_buf *ln0, const line_buf *ln1,
280
- const line_buf *ln2, void *dp,
281
- int bit_depth, int count)
282
- {
283
- const si32 *sp0 = ln0->i32 ;
284
- const si32 *sp1 = ln1->i32 ;
285
- const si32 *sp2 = ln2->i32 ;
286
- ui16* p = (ui16*)dp;
287
-
288
- __m256i max_val_vec = _mm256_set1_epi32 ((1 << bit_depth) - 1 );
289
- __m256i zero = _mm256_setzero_si256 ();
290
-
291
- __m256i m0 = _mm256_set_epi64x (0x0B0A0908FFFF0706 , 0x0504FFFF03020100 ,
292
- 0x0B0A0908FFFF0706 , 0x0504FFFF03020100 );
293
- __m256i m1 = _mm256_set_epi64x (0xFFFFFFFF0504FFFF , 0xFFFF0100FFFFFFFF ,
294
- 0xFFFFFFFF0504FFFF , 0xFFFF0100FFFFFFFF );
295
- __m256i m2 = _mm256_set_epi64x (0xFFFFFFFFFFFFFFFF , 0xFFFF0F0E0D0CFFFF ,
296
- 0xFFFFFFFFFFFFFFFF , 0xFFFF0F0E0D0CFFFF );
297
- __m256i m3 = _mm256_set_epi64x (0x0706FFFFFFFF0302 , 0x0D0CFFFFFFFF0908 ,
298
- 0x0706FFFFFFFF0302 , 0x0D0CFFFFFFFF0908 );
299
- __m256i m4 = _mm256_set_epi64x (0xFFFF03020100FFFF , 0xFFFFFFFFFFFFFFFF ,
300
- 0xFFFF03020100FFFF , 0xFFFFFFFFFFFFFFFF );
301
- __m256i m5 = _mm256_set_epi64x (0xFFFFFFFF0F0EFFFF , 0xFFFF0B0AFFFFFFFF ,
302
- 0xFFFFFFFF0F0EFFFF , 0xFFFF0B0AFFFFFFFF );
303
- __m256i m6 = _mm256_set_epi64x (0x0F0E0D0CFFFF0B0A , 0x0908FFFF07060504 ,
304
- 0x0F0E0D0CFFFF0B0A , 0x0908FFFF07060504 );
305
-
306
- // 24 entries in each loop
307
- for ( ; count >= 16 ; count -= 16 , sp0 += 16 , sp1 += 16 , sp2 += 16 , p += 48 )
308
- {
309
- __m256i a, b, t, u, v;
310
- a = _mm256_load_si256 ((__m256i*)sp0);
311
- a = _mm256_max_epi32 (a, zero);
312
- t = _mm256_min_epi32 (a, max_val_vec);
313
-
314
- a = _mm256_load_si256 ((__m256i*)sp1);
315
- a = _mm256_max_epi32 (a, zero);
316
- a = _mm256_min_epi32 (a, max_val_vec);
317
- a = _mm256_slli_epi32 (a, 16 );
318
- t = _mm256_or_si256 (t, a);
319
-
320
- a = _mm256_load_si256 ((__m256i*)sp2);
321
- a = _mm256_max_epi32 (a, zero);
322
- u = _mm256_min_epi32 (a, max_val_vec);
323
-
324
- a = _mm256_load_si256 ((__m256i*)sp0 + 1 );
325
- a = _mm256_max_epi32 (a, zero);
326
- a = _mm256_min_epi32 (a, max_val_vec);
327
- a = _mm256_slli_epi32 (a, 16 );
328
- u = _mm256_or_si256 (u, a);
329
-
330
- a = _mm256_load_si256 ((__m256i*)sp1 + 1 );
331
- a = _mm256_max_epi32 (a, zero);
332
- v = _mm256_min_epi32 (a, max_val_vec);
333
-
334
- a = _mm256_load_si256 ((__m256i*)sp2 + 1 );
335
- a = _mm256_max_epi32 (a, zero);
336
- a = _mm256_min_epi32 (a, max_val_vec);
337
- a = _mm256_slli_epi32 (a, 16 );
338
- v = _mm256_or_si256 (v, a);
339
-
340
- // start combining using the sse41 method
341
- __m256i xt, xu, xv;
342
-
343
- a = _mm256_shuffle_epi8 (t, m0);
344
- b = _mm256_shuffle_epi8 (u, m1);
345
- xt = _mm256_or_si256 (a, b);
346
-
347
- a = _mm256_shuffle_epi8 (t, m2);
348
- b = _mm256_shuffle_epi8 (u, m3);
349
- a = _mm256_or_si256 (a, b);
350
- b = _mm256_shuffle_epi8 (v, m4);
351
- xu = _mm256_or_si256 (a, b);
352
-
353
- a = _mm256_shuffle_epi8 (u, m5);
354
- b = _mm256_shuffle_epi8 (v, m6);
355
- xv = _mm256_or_si256 (a, b);
356
-
357
- // reorder them in the correct order
358
- t = _mm256_set_epi64x (_mm256_extract_epi64 (xt, 2 ),
359
- _mm256_extract_epi64 (xu, 0 ),
360
- _mm256_extract_epi64 (xt, 1 ),
361
- _mm256_extract_epi64 (xt, 0 ));
362
- _mm256_storeu_si256 ((__m256i*)p , t);
363
-
364
- t = _mm256_set_epi64x (_mm256_extract_epi64 (xv, 0 ),
365
- _mm256_extract_epi64 (xu, 1 ),
366
- _mm256_extract_epi64 (xu, 2 ),
367
- _mm256_extract_epi64 (xt, 3 ));
368
- _mm256_storeu_si256 ((__m256i*)p + 1 , t);
369
-
370
- t = _mm256_set_epi64x (_mm256_extract_epi64 (xv, 3 ),
371
- _mm256_extract_epi64 (xv, 2 ),
372
- _mm256_extract_epi64 (xu, 3 ),
373
- _mm256_extract_epi64 (xv, 1 ));
374
- _mm256_storeu_si256 ((__m256i*)p + 2 , t);
375
- }
376
-
377
- int max_val = (1 <<bit_depth) - 1 ;
378
- for ( ; count > 0 ; --count)
379
- {
380
- int val;
381
- val = *sp0++;
382
- val = val >= 0 ? val : 0 ;
383
- val = val <= max_val ? val : max_val;
384
- *p++ = be2le ((ui16) val);
385
- val = *sp1++;
386
- val = val >= 0 ? val : 0 ;
387
- val = val <= max_val ? val : max_val;
388
- *p++ = be2le ((ui16) val);
389
- val = *sp2++;
390
- val = val >= 0 ? val : 0 ;
391
- val = val <= max_val ? val : max_val;
392
- *p++ = (ui16) val;
393
- }
394
- }
395
-
396
278
// ///////////////////////////////////////////////////////////////////////////
397
279
void avx2_cvrt_32b1c_to_16ub1c_be (const line_buf *ln0, const line_buf *ln1,
398
280
const line_buf *ln2, void *dp,
@@ -436,122 +318,4 @@ namespace ojph {
436
318
*p++ = be2le ((ui16) val);
437
319
}
438
320
}
439
-
440
- // ///////////////////////////////////////////////////////////////////////////
441
- void avx2_cvrt_32b3c_to_16ub3c_be (const line_buf *ln0, const line_buf *ln1,
442
- const line_buf *ln2, void *dp,
443
- int bit_depth, int count)
444
- {
445
- const si32 *sp0 = ln0->i32 ;
446
- const si32 *sp1 = ln1->i32 ;
447
- const si32 *sp2 = ln2->i32 ;
448
- ui16* p = (ui16*)dp;
449
-
450
- __m256i max_val_vec = _mm256_set1_epi32 ((1 << bit_depth) - 1 );
451
- __m256i zero = _mm256_setzero_si256 ();
452
-
453
- __m256i m0 = _mm256_set_epi64x (0x0A0B0809FFFF0607 , 0x0405FFFF02030001 ,
454
- 0x0A0B0809FFFF0607 , 0x0405FFFF02030001 );
455
- __m256i m1 = _mm256_set_epi64x (0xFFFFFFFF0405FFFF , 0xFFFF0001FFFFFFFF ,
456
- 0xFFFFFFFF0405FFFF , 0xFFFF0001FFFFFFFF );
457
- __m256i m2 = _mm256_set_epi64x (0xFFFFFFFFFFFFFFFF , 0xFFFF0E0F0C0DFFFF ,
458
- 0xFFFFFFFFFFFFFFFF , 0xFFFF0E0F0C0DFFFF );
459
- __m256i m3 = _mm256_set_epi64x (0x0607FFFFFFFF0203 , 0x0C0DFFFFFFFF0809 ,
460
- 0x0607FFFFFFFF0203 , 0x0C0DFFFFFFFF0809 );
461
- __m256i m4 = _mm256_set_epi64x (0xFFFF02030001FFFF , 0xFFFFFFFFFFFFFFFF ,
462
- 0xFFFF02030001FFFF , 0xFFFFFFFFFFFFFFFF );
463
- __m256i m5 = _mm256_set_epi64x (0xFFFFFFFF0E0FFFFF , 0xFFFF0A0BFFFFFFFF ,
464
- 0xFFFFFFFF0E0FFFFF , 0xFFFF0A0BFFFFFFFF );
465
- __m256i m6 = _mm256_set_epi64x (0x0E0F0C0DFFFF0A0B , 0x0809FFFF06070405 ,
466
- 0x0E0F0C0DFFFF0A0B , 0x0809FFFF06070405 );
467
-
468
- // 24 entries in each loop
469
- for ( ; count >= 16 ; count -= 16 , sp0 += 16 , sp1 += 16 , sp2 += 16 , p += 48 )
470
- {
471
- __m256i a, b, t, u, v;
472
- a = _mm256_load_si256 ((__m256i*)sp0);
473
- a = _mm256_max_epi32 (a, zero);
474
- t = _mm256_min_epi32 (a, max_val_vec);
475
-
476
- a = _mm256_load_si256 ((__m256i*)sp1);
477
- a = _mm256_max_epi32 (a, zero);
478
- a = _mm256_min_epi32 (a, max_val_vec);
479
- a = _mm256_slli_epi32 (a, 16 );
480
- t = _mm256_or_si256 (t, a);
481
-
482
- a = _mm256_load_si256 ((__m256i*)sp2);
483
- a = _mm256_max_epi32 (a, zero);
484
- u = _mm256_min_epi32 (a, max_val_vec);
485
-
486
- a = _mm256_load_si256 ((__m256i*)sp0 + 1 );
487
- a = _mm256_max_epi32 (a, zero);
488
- a = _mm256_min_epi32 (a, max_val_vec);
489
- a = _mm256_slli_epi32 (a, 16 );
490
- u = _mm256_or_si256 (u, a);
491
-
492
- a = _mm256_load_si256 ((__m256i*)sp1 + 1 );
493
- a = _mm256_max_epi32 (a, zero);
494
- v = _mm256_min_epi32 (a, max_val_vec);
495
-
496
- a = _mm256_load_si256 ((__m256i*)sp2 + 1 );
497
- a = _mm256_max_epi32 (a, zero);
498
- a = _mm256_min_epi32 (a, max_val_vec);
499
- a = _mm256_slli_epi32 (a, 16 );
500
- v = _mm256_or_si256 (v, a);
501
-
502
- // start combining using the sse41 method
503
- __m256i xt, xu, xv;
504
-
505
- a = _mm256_shuffle_epi8 (t, m0);
506
- b = _mm256_shuffle_epi8 (u, m1);
507
- xt = _mm256_or_si256 (a, b);
508
-
509
- a = _mm256_shuffle_epi8 (t, m2);
510
- b = _mm256_shuffle_epi8 (u, m3);
511
- a = _mm256_or_si256 (a, b);
512
- b = _mm256_shuffle_epi8 (v, m4);
513
- xu = _mm256_or_si256 (a, b);
514
-
515
- a = _mm256_shuffle_epi8 (u, m5);
516
- b = _mm256_shuffle_epi8 (v, m6);
517
- xv = _mm256_or_si256 (a, b);
518
-
519
- // reorder them in the correct order
520
- t = _mm256_set_epi64x (_mm256_extract_epi64 (xt, 2 ),
521
- _mm256_extract_epi64 (xu, 0 ),
522
- _mm256_extract_epi64 (xt, 1 ),
523
- _mm256_extract_epi64 (xt, 0 ));
524
- _mm256_storeu_si256 ((__m256i*)p , t);
525
-
526
- t = _mm256_set_epi64x (_mm256_extract_epi64 (xv, 0 ),
527
- _mm256_extract_epi64 (xu, 1 ),
528
- _mm256_extract_epi64 (xu, 2 ),
529
- _mm256_extract_epi64 (xt, 3 ));
530
- _mm256_storeu_si256 ((__m256i*)p + 1 , t);
531
-
532
- t = _mm256_set_epi64x (_mm256_extract_epi64 (xv, 3 ),
533
- _mm256_extract_epi64 (xv, 2 ),
534
- _mm256_extract_epi64 (xu, 3 ),
535
- _mm256_extract_epi64 (xv, 1 ));
536
- _mm256_storeu_si256 ((__m256i*)p + 2 , t);
537
- }
538
-
539
- int max_val = (1 <<bit_depth) - 1 ;
540
- for ( ; count > 0 ; --count)
541
- {
542
- int val;
543
- val = *sp0++;
544
- val = val >= 0 ? val : 0 ;
545
- val = val <= max_val ? val : max_val;
546
- *p++ = be2le ((ui16) val);
547
- val = *sp1++;
548
- val = val >= 0 ? val : 0 ;
549
- val = val <= max_val ? val : max_val;
550
- *p++ = be2le ((ui16) val);
551
- val = *sp2++;
552
- val = val >= 0 ? val : 0 ;
553
- val = val <= max_val ? val : max_val;
554
- *p++ = be2le ((ui16) val);
555
- }
556
- }
557
321
}
0 commit comments