diff --git a/CMakeLists.txt b/CMakeLists.txt index 2601d1d..8c1f08e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,11 +34,11 @@ set_target_properties( #target_include_directories(hwytest PRIVATE thirdparty/hwy) #target_link_libraries(hwytest PRIVATE hwy) -if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU|IntelLLVM") set(CMAKE_CXX_FLAGS "-Wall -Wextra -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0 -g -fsanitize=address") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -DNDEBUG") - set(CMAKE_CXX_FLAGS_RelWithDebInfo "${CMAKE_CXX_FLAGS} -O3 -g -DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -O3 -g -DNDEBUG") endif() if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") # MSVC diff --git a/lib/block_coding_128.cpp b/lib/block_coding_128.cpp index 5bb4791..96249e7 100644 --- a/lib/block_coding_128.cpp +++ b/lib/block_coding_128.cpp @@ -21,59 +21,59 @@ HWY_ALIGN constexpr int16_t indices[] = { HWY_CAPPED(uint8_t, Lanes(u8) / 2) u8_64; HWY_CAPPED(uint64_t, Lanes(u64) / 2) u64_64; -auto v0 = Load(s16, sp); -auto v1 = Load(s16, sp + 8); -auto v2 = Load(s16, sp + 16); -auto v3 = Load(s16, sp + 24); -auto v4 = Load(s16, sp + 32); -auto v5 = Load(s16, sp + 40); -auto v6 = Load(s16, sp + 48); -auto v7 = Load(s16, sp + 56); - -auto row0 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 8])); -row0 = InsertLane(row0, 3, ExtractLane(v2, 0)); -auto row1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 8])); -auto row1_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 8])); -auto row2 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[3 * 8])); -auto row3 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 8])); -auto row3_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 * 8])); +auto v0 = Load(s16, sp); +auto v1 = Load(s16, sp + 8); +auto row0 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 8])); +auto v2 = Load(s16, sp + 16); +row0 = InsertLane(row0, 3, ExtractLane(v2, 0)); + +auto row1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 8])); +auto v3 = Load(s16, sp + 24); +auto v4 = Load(s16, sp + 32); +auto v5 = Load(s16, sp + 40); +auto v6 = Load(s16, sp + 48); + +auto row1_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 8])); +auto m5 = FirstN(s16, 5); +row1 = IfThenZeroElse(m5, row1); +row1_1 = IfThenElseZero(m5, row1_1); +row1 = Or(row1, row1_1); +row1 = InsertLane(row1, 2, ExtractLane(v4, 0)); +auto row2 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[3 * 8])); +auto m3 = FirstN(s16, 3); +row2 = IfThenZeroElse(m3, row2); +row2 = InsertLane(row2, 0, ExtractLane(v1, 4)); +row2 = InsertLane(row2, 1, ExtractLane(v2, 3)); +row2 = InsertLane(row2, 2, ExtractLane(v3, 2)); +row2 = InsertLane(row2, 5, ExtractLane(v6, 0)); +auto v7 = Load(s16, sp + 56); +auto row3 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 8])); +auto row3_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 * 8])); +alignas(16) uint8_t mask34[8] = {0b00111100}; +auto m34 = LoadMaskBits(s16, mask34); +row3 = IfThenZeroElse(m34, row3); +row3_1 = IfThenElseZero(m34, row3_1); +row3 = Or(row3, row3_1); + auto row4 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[6 * 8])); auto row4_1 = TwoTablesLookupLanes(s16, v6, v7, SetTableIndices(s16, &indices[7 * 8])); +row4 = IfThenZeroElse(m34, row4); +row4_1 = IfThenElseZero(m34, row4_1); +row4 = Or(row4, row4_1); auto row5 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[8 * 8])); auto row6 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[9 * 8])); auto row6_1 = TwoTablesLookupLanes(s16, v6, v7, SetTableIndices(s16, &indices[10 * 8])); auto row7 = TwoTablesLookupLanes(s16, v6, v7, SetTableIndices(s16, &indices[11 * 8])); row7 = InsertLane(row7, 4, ExtractLane(v5, 7)); - -auto m5 = FirstN(s16, 5); -auto m3 = FirstN(s16, 3); -HWY_ALIGN uint8_t mask34[8] = {0b00111100}; -auto m34 = LoadMaskBits(s16, mask34); - -row1 = IfThenZeroElse(m5, row1); -row1_1 = IfThenElseZero(m5, row1_1); -row1 = Or(row1, row1_1); -row1 = InsertLane(row1, 2, ExtractLane(v4, 0)); -row2 = IfThenZeroElse(m3, row2); -row2 = InsertLane(row2, 0, ExtractLane(v1, 4)); -row2 = InsertLane(row2, 1, ExtractLane(v2, 3)); -row2 = InsertLane(row2, 2, ExtractLane(v3, 2)); -row2 = InsertLane(row2, 5, ExtractLane(v6, 0)); -row3 = IfThenZeroElse(m34, row3); -row3_1 = IfThenElseZero(m34, row3_1); -row3 = Or(row3, row3_1); -row4 = IfThenZeroElse(m34, row4); -row4_1 = IfThenElseZero(m34, row4_1); -row4 = Or(row4, row4_1); -row5 = IfThenZeroElse(Not(m5), row5); -row5 = InsertLane(row5, 2, ExtractLane(v1, 7)); -row5 = InsertLane(row5, 5, ExtractLane(v4, 5)); -row5 = InsertLane(row5, 6, ExtractLane(v5, 4)); -row5 = InsertLane(row5, 7, ExtractLane(v6, 3)); -row6 = IfThenZeroElse(m3, row6); -row6_1 = IfThenElseZero(m3, row6_1); -row6 = Or(row6, row6_1); -row6 = InsertLane(row6, 5, ExtractLane(v3, 7)); +row5 = IfThenZeroElse(Not(m5), row5); +row5 = InsertLane(row5, 2, ExtractLane(v1, 7)); +row5 = InsertLane(row5, 5, ExtractLane(v4, 5)); +row5 = InsertLane(row5, 6, ExtractLane(v5, 4)); +row5 = InsertLane(row5, 7, ExtractLane(v6, 3)); +row6 = IfThenZeroElse(m3, row6); +row6_1 = IfThenElseZero(m3, row6_1); +row6 = Or(row6, row6_1); +row6 = InsertLane(row6, 5, ExtractLane(v3, 7)); /* DCT block is now in zig-zag order; start Huffman encoding process. */ diff --git a/lib/block_coding_256.cpp b/lib/block_coding_256.cpp index b7de151..f45d03f 100644 --- a/lib/block_coding_256.cpp +++ b/lib/block_coding_256.cpp @@ -20,14 +20,23 @@ HWY_ALIGN constexpr int16_t indices[] = { auto v0 = Load(s16, sp); auto v1 = Load(s16, sp + 16); -auto v2 = Load(s16, sp + 32); -auto v3 = Load(s16, sp + 48); - -auto row01 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 16])); -auto row23 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 16])); -auto row45 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 16])); -auto row67 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[3 * 16])); -auto row23_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 16])); + +auto row01 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 16])); +auto v2 = Load(s16, sp + 32); +row01 = InsertLane(row01, 10, ExtractLane(v2, 0)); +auto row23 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 16])); +auto v3 = Load(s16, sp + 48); +auto row23_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 16])); +HWY_ALIGN uint8_t m1[8] = {0x07, 0xFF}; +auto maskv1 = LoadMaskBits(s16, m1); +row23 = IfThenElseZero(maskv1, row23); +row23_1 = IfThenZeroElse(maskv1, row23_1); +row23 = Or(row23, row23_1); + +auto row45 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 16])); +auto row67 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[3 * 16])); +row67 = InsertLane(row67, 5, ExtractLane(v1, 15)); + auto row45_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 * 16])); // HWY_ALIGN int16_t m[32] = { @@ -42,19 +51,11 @@ auto row45_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 // row23_1 = IfThenZeroElse(MaskFromVec(maskv1), row23_1); // row45_1 = IfThenZeroElse(MaskFromVec(maskv2), row45_1); -HWY_ALIGN uint8_t m1[8] = {0x07, 0xFF}; HWY_ALIGN uint8_t m2[8] = {0xFF, 0xE0}; -auto maskv1 = LoadMaskBits(s16, m1); auto maskv2 = LoadMaskBits(s16, m2); - -row23 = IfThenElseZero(maskv1, row23); -row45 = IfThenElseZero(maskv2, row45); -row23_1 = IfThenZeroElse(maskv1, row23_1); -row45_1 = IfThenZeroElse(maskv2, row45_1); -row23 = Or(row23, row23_1); -row45 = Or(row45, row45_1); -row01 = InsertLane(row01, 10, ExtractLane(v2, 0)); -row67 = InsertLane(row67, 5, ExtractLane(v1, 15)); +row45 = IfThenElseZero(maskv2, row45); +row45_1 = IfThenZeroElse(maskv2, row45_1); +row45 = Or(row45, row45_1); /* DCT block is now in zig-zag order; start Huffman encoding process. */