osamu620 · osamu620 · Oct 6, 2023 · Oct 6, 2023
diff --git a/lib/block_coding_128.cpp b/lib/block_coding_128.cpp
@@ -89,10 +89,14 @@ auto row4_ne_0  = VecFromMask(s16, Eq(row4, zero));
 auto row5_ne_0  = VecFromMask(s16, Eq(row5, zero));
 auto row6_ne_0  = VecFromMask(s16, Eq(row6, zero));
 auto row7_ne_0  = VecFromMask(s16, Eq(row7, zero));
-auto row10_ne_0 = ConcatEven(u8, BitCast(u8, row0_ne_0), BitCast(u8, row1_ne_0));
-auto row32_ne_0 = ConcatEven(u8, BitCast(u8, row2_ne_0), BitCast(u8, row3_ne_0));
-auto row54_ne_0 = ConcatEven(u8, BitCast(u8, row4_ne_0), BitCast(u8, row5_ne_0));
-auto row76_ne_0 = ConcatEven(u8, BitCast(u8, row6_ne_0), BitCast(u8, row7_ne_0));
+auto row10_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row1_ne_0), BitCast(u16, row0_ne_0));  // operands are (row1, row0); the replaced ConcatEven call took (row0, row1)
+auto row32_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row3_ne_0), BitCast(u16, row2_ne_0));  // operands are (row3, row2)
+auto row54_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row5_ne_0), BitCast(u16, row4_ne_0));  // operands are (row5, row4)
+auto row76_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row7_ne_0), BitCast(u16, row6_ne_0));  // operands are (row7, row6)
+// Old form, reference only (NOTE(review): presumably equivalent to the call above - confirm): auto row10_ne_0 = ConcatEven(u8, BitCast(u8, row0_ne_0), BitCast(u8, row1_ne_0));
+// Old form, reference only: auto row32_ne_0 = ConcatEven(u8, BitCast(u8, row2_ne_0), BitCast(u8, row3_ne_0));
+// Old form, reference only: auto row54_ne_0 = ConcatEven(u8, BitCast(u8, row4_ne_0), BitCast(u8, row5_ne_0));
+// Old form, reference only: auto row76_ne_0 = ConcatEven(u8, BitCast(u8, row6_ne_0), BitCast(u8, row7_ne_0));
 
 /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
 HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080};
@@ -129,10 +133,14 @@ auto row6_lz = LeadingZeroCount(abs_row6);
 auto row7_lz = LeadingZeroCount(abs_row7);
 
 /* Narrow leading zero count to 8 bits. */
-auto row01_lz = ConcatEven(u8, BitCast(u8, row1_lz), BitCast(u8, row0_lz));
-auto row23_lz = ConcatEven(u8, BitCast(u8, row3_lz), BitCast(u8, row2_lz));
-auto row45_lz = ConcatEven(u8, BitCast(u8, row5_lz), BitCast(u8, row4_lz));
-auto row67_lz = ConcatEven(u8, BitCast(u8, row7_lz), BitCast(u8, row6_lz));
+auto row01_lz = OrderedTruncate2To(u8, BitCast(u16, row0_lz), BitCast(u16, row1_lz));  // operands are (row0, row1); the replaced ConcatEven call took (row1, row0)
+auto row23_lz = OrderedTruncate2To(u8, BitCast(u16, row2_lz), BitCast(u16, row3_lz));  // operands are (row2, row3)
+auto row45_lz = OrderedTruncate2To(u8, BitCast(u16, row4_lz), BitCast(u16, row5_lz));  // operands are (row4, row5)
+auto row67_lz = OrderedTruncate2To(u8, BitCast(u16, row6_lz), BitCast(u16, row7_lz));  // operands are (row6, row7)
+// Old form, reference only (NOTE(review): presumably equivalent to the call above - confirm): auto row01_lz = ConcatEven(u8, BitCast(u8, row1_lz), BitCast(u8, row0_lz));
+// Old form, reference only: auto row23_lz = ConcatEven(u8, BitCast(u8, row3_lz), BitCast(u8, row2_lz));
+// Old form, reference only: auto row45_lz = ConcatEven(u8, BitCast(u8, row5_lz), BitCast(u8, row4_lz));
+// Old form, reference only: auto row67_lz = ConcatEven(u8, BitCast(u8, row7_lz), BitCast(u8, row6_lz));
 /* Compute nbits needed to specify magnitude of each coefficient. */
 const auto sixteen = Set(u8, 16);
 auto row01_nbits   = Sub(sixteen, row01_lz);

diff --git a/lib/block_coding_256.cpp b/lib/block_coding_256.cpp
@@ -67,8 +67,8 @@ auto row01_ne_0   = VecFromMask(s16, Eq(row01, zero));
 auto row23_ne_0   = VecFromMask(s16, Eq(row23, zero));
 auto row45_ne_0   = VecFromMask(s16, Eq(row45, zero));
 auto row67_ne_0   = VecFromMask(s16, Eq(row67, zero));
-auto row3210_ne_0 = ConcatEven(u8, BitCast(u8, row23_ne_0), BitCast(u8, row01_ne_0));
-auto row7654_ne_0 = ConcatEven(u8, BitCast(u8, row67_ne_0), BitCast(u8, row45_ne_0));
+auto row3210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row01_ne_0), BitCast(u16, row23_ne_0));  // operands are (row01, row23); the replaced ConcatEven call took (row23, row01). NOTE(review): presumably equivalent - confirm
+auto row7654_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row45_ne_0), BitCast(u16, row67_ne_0));  // operands are (row45, row67); the replaced ConcatEven call took (row67, row45)
 
 /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
 HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
@@ -96,8 +96,8 @@ auto row23_lz = LeadingZeroCount(abs_row23);
 auto row45_lz = LeadingZeroCount(abs_row45);
 auto row67_lz = LeadingZeroCount(abs_row67);
 /* Narrow leading zero count to 8 bits. */
-auto row0123_lz = ConcatEven(u8, BitCast(u8, row23_lz), BitCast(u8, row01_lz));
-auto row4567_lz = ConcatEven(u8, BitCast(u8, row67_lz), BitCast(u8, row45_lz));
+auto row0123_lz = OrderedTruncate2To(u8, BitCast(u16, row01_lz), BitCast(u16, row23_lz));  // operands are (row01, row23); the replaced ConcatEven call took (row23, row01). NOTE(review): presumably equivalent - confirm
+auto row4567_lz = OrderedTruncate2To(u8, BitCast(u16, row45_lz), BitCast(u16, row67_lz));  // operands are (row45, row67); the replaced ConcatEven call took (row67, row45)
 /* Compute nbits needed to specify magnitude of each coefficient. */
 auto row0123_nbits = Sub(Set(u8, 16), row0123_lz);
 auto row4567_nbits = Sub(Set(u8, 16), row4567_lz);

diff --git a/lib/block_coding_512.cpp b/lib/block_coding_512.cpp
@@ -28,7 +28,7 @@ auto row4567 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1
 auto zero             = Zero(s16);
 auto row0123_ne_0     = VecFromMask(s16, Eq(row0123, zero));
 auto row4567_ne_0     = VecFromMask(s16, Eq(row4567, zero));
-auto row76543210_ne_0 = ConcatEven(u8, BitCast(u8, row4567_ne_0), BitCast(u8, row0123_ne_0));
+auto row76543210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row0123_ne_0), BitCast(u16, row4567_ne_0));  // operands are (row0123, row4567); the replaced ConcatEven call took (row4567, row0123). NOTE(review): presumably equivalent - confirm
 
 /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
 HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
@@ -50,7 +50,7 @@ auto abs_row4567 = Abs(row4567);
 auto row0123_lz = LeadingZeroCount(abs_row0123);
 auto row4567_lz = LeadingZeroCount(abs_row4567);
 /* Narrow leading zero count to 8 bits. */
-auto row01234567_lz = ConcatEven(u8, BitCast(u8, row4567_lz), BitCast(u8, row0123_lz));
+auto row01234567_lz = OrderedTruncate2To(u8, BitCast(u16, row0123_lz), BitCast(u16, row4567_lz));  // operands are (row0123, row4567); the replaced ConcatEven call took (row4567, row0123). NOTE(review): presumably equivalent - confirm
 /* Compute nbits needed to specify magnitude of each coefficient. */
 auto row01234567_nbits = Sub(Set(u8, 16), row01234567_lz);
 /* Store nbits. */