From 8afe6a8ce352609b3afa6f12d80276c71a552066 Mon Sep 17 00:00:00 2001 From: OSAMU WATANABE Date: Fri, 6 Oct 2023 17:40:02 +0900 Subject: [PATCH] Improve zigzag scan --- lib/block_coding_128.cpp | 24 ++++++++++++++++-------- lib/block_coding_256.cpp | 8 ++++---- lib/block_coding_512.cpp | 4 ++-- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/lib/block_coding_128.cpp b/lib/block_coding_128.cpp index 96249e7..b012647 100644 --- a/lib/block_coding_128.cpp +++ b/lib/block_coding_128.cpp @@ -89,10 +89,14 @@ auto row4_ne_0 = VecFromMask(s16, Eq(row4, zero)); auto row5_ne_0 = VecFromMask(s16, Eq(row5, zero)); auto row6_ne_0 = VecFromMask(s16, Eq(row6, zero)); auto row7_ne_0 = VecFromMask(s16, Eq(row7, zero)); -auto row10_ne_0 = ConcatEven(u8, BitCast(u8, row0_ne_0), BitCast(u8, row1_ne_0)); -auto row32_ne_0 = ConcatEven(u8, BitCast(u8, row2_ne_0), BitCast(u8, row3_ne_0)); -auto row54_ne_0 = ConcatEven(u8, BitCast(u8, row4_ne_0), BitCast(u8, row5_ne_0)); -auto row76_ne_0 = ConcatEven(u8, BitCast(u8, row6_ne_0), BitCast(u8, row7_ne_0)); +auto row10_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row1_ne_0), BitCast(u16, row0_ne_0)); +auto row32_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row3_ne_0), BitCast(u16, row2_ne_0)); +auto row54_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row5_ne_0), BitCast(u16, row4_ne_0)); +auto row76_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row7_ne_0), BitCast(u16, row6_ne_0)); +// auto row10_ne_0 = ConcatEven(u8, BitCast(u8, row0_ne_0), BitCast(u8, row1_ne_0)); +// auto row32_ne_0 = ConcatEven(u8, BitCast(u8, row2_ne_0), BitCast(u8, row3_ne_0)); +// auto row54_ne_0 = ConcatEven(u8, BitCast(u8, row4_ne_0), BitCast(u8, row5_ne_0)); +// auto row76_ne_0 = ConcatEven(u8, BitCast(u8, row6_ne_0), BitCast(u8, row7_ne_0)); /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */ HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080}; @@ -129,10 +133,14 @@ auto row6_lz = LeadingZeroCount(abs_row6); auto row7_lz = LeadingZeroCount(abs_row7); /* Narrow leading zero count to 8 bits. */ -auto row01_lz = ConcatEven(u8, BitCast(u8, row1_lz), BitCast(u8, row0_lz)); -auto row23_lz = ConcatEven(u8, BitCast(u8, row3_lz), BitCast(u8, row2_lz)); -auto row45_lz = ConcatEven(u8, BitCast(u8, row5_lz), BitCast(u8, row4_lz)); -auto row67_lz = ConcatEven(u8, BitCast(u8, row7_lz), BitCast(u8, row6_lz)); +auto row01_lz = OrderedTruncate2To(u8, BitCast(u16, row0_lz), BitCast(u16, row1_lz)); +auto row23_lz = OrderedTruncate2To(u8, BitCast(u16, row2_lz), BitCast(u16, row3_lz)); +auto row45_lz = OrderedTruncate2To(u8, BitCast(u16, row4_lz), BitCast(u16, row5_lz)); +auto row67_lz = OrderedTruncate2To(u8, BitCast(u16, row6_lz), BitCast(u16, row7_lz)); +// auto row01_lz = ConcatEven(u8, BitCast(u8, row1_lz), BitCast(u8, row0_lz)); +// auto row23_lz = ConcatEven(u8, BitCast(u8, row3_lz), BitCast(u8, row2_lz)); +// auto row45_lz = ConcatEven(u8, BitCast(u8, row5_lz), BitCast(u8, row4_lz)); +// auto row67_lz = ConcatEven(u8, BitCast(u8, row7_lz), BitCast(u8, row6_lz)); /* Compute nbits needed to specify magnitude of each coefficient. */ const auto sixteen = Set(u8, 16); auto row01_nbits = Sub(sixteen, row01_lz); diff --git a/lib/block_coding_256.cpp b/lib/block_coding_256.cpp index f45d03f..73372b8 100644 --- a/lib/block_coding_256.cpp +++ b/lib/block_coding_256.cpp @@ -67,8 +67,8 @@ auto row01_ne_0 = VecFromMask(s16, Eq(row01, zero)); auto row23_ne_0 = VecFromMask(s16, Eq(row23, zero)); auto row45_ne_0 = VecFromMask(s16, Eq(row45, zero)); auto row67_ne_0 = VecFromMask(s16, Eq(row67, zero)); -auto row3210_ne_0 = ConcatEven(u8, BitCast(u8, row23_ne_0), BitCast(u8, row01_ne_0)); -auto row7654_ne_0 = ConcatEven(u8, BitCast(u8, row67_ne_0), BitCast(u8, row45_ne_0)); +auto row3210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row01_ne_0), BitCast(u16, row23_ne_0)); +auto row7654_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row45_ne_0), BitCast(u16, row67_ne_0)); /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */ HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080, @@ -96,8 +96,8 @@ auto row23_lz = LeadingZeroCount(abs_row23); auto row45_lz = LeadingZeroCount(abs_row45); auto row67_lz = LeadingZeroCount(abs_row67); /* Narrow leading zero count to 8 bits. */ -auto row0123_lz = ConcatEven(u8, BitCast(u8, row23_lz), BitCast(u8, row01_lz)); -auto row4567_lz = ConcatEven(u8, BitCast(u8, row67_lz), BitCast(u8, row45_lz)); +auto row0123_lz = OrderedTruncate2To(u8, BitCast(u16, row01_lz), BitCast(u16, row23_lz)); +auto row4567_lz = OrderedTruncate2To(u8, BitCast(u16, row45_lz), BitCast(u16, row67_lz)); /* Compute nbits needed to specify magnitude of each coefficient. */ auto row0123_nbits = Sub(Set(u8, 16), row0123_lz); auto row4567_nbits = Sub(Set(u8, 16), row4567_lz); diff --git a/lib/block_coding_512.cpp b/lib/block_coding_512.cpp index cec3302..b59733d 100644 --- a/lib/block_coding_512.cpp +++ b/lib/block_coding_512.cpp @@ -28,7 +28,7 @@ auto row4567 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 auto zero = Zero(s16); auto row0123_ne_0 = VecFromMask(s16, Eq(row0123, zero)); auto row4567_ne_0 = VecFromMask(s16, Eq(row4567, zero)); -auto row76543210_ne_0 = ConcatEven(u8, BitCast(u8, row4567_ne_0), BitCast(u8, row0123_ne_0)); +auto row76543210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row0123_ne_0), BitCast(u16, row4567_ne_0)); /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */ HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080, @@ -50,7 +50,7 @@ auto abs_row4567 = Abs(row4567); auto row0123_lz = LeadingZeroCount(abs_row0123); auto row4567_lz = LeadingZeroCount(abs_row4567); /* Narrow leading zero count to 8 bits. */ -auto row01234567_lz = ConcatEven(u8, BitCast(u8, row4567_lz), BitCast(u8, row0123_lz)); +auto row01234567_lz = OrderedTruncate2To(u8, BitCast(u16, row0123_lz), BitCast(u16, row4567_lz)); /* Compute nbits needed to specify magnitude of each coefficient. */ auto row01234567_nbits = Sub(Set(u8, 16), row01234567_lz); /* Store nbits. */