Skip to content

Commit

Permalink
Merge pull request #31 from osamu620/develop
Browse files (browse the repository at this point in the history)
Update CMakeLists.txt and improve huffman coding for AVX2
  • Loading branch information
osamu620 authored Oct 6, 2023
2 parents 55fc10a + a1a9be5 commit f788f96
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 67 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ set_target_properties(
#target_include_directories(hwytest PRIVATE thirdparty/hwy)
#target_link_libraries(hwytest PRIVATE hwy)

if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU|IntelLLVM")
set(CMAKE_CXX_FLAGS "-Wall -Wextra -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0 -g -fsanitize=address")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -DNDEBUG")
set(CMAKE_CXX_FLAGS_RelWithDebInfo "${CMAKE_CXX_FLAGS} -O3 -g -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -O3 -g -DNDEBUG")
endif()

if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") # MSVC
Expand Down
92 changes: 46 additions & 46 deletions lib/block_coding_128.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,59 +21,59 @@ HWY_ALIGN constexpr int16_t indices[] = {
HWY_CAPPED(uint8_t, Lanes(u8) / 2) u8_64;
HWY_CAPPED(uint64_t, Lanes(u64) / 2) u64_64;

auto v0 = Load(s16, sp);
auto v1 = Load(s16, sp + 8);
auto v2 = Load(s16, sp + 16);
auto v3 = Load(s16, sp + 24);
auto v4 = Load(s16, sp + 32);
auto v5 = Load(s16, sp + 40);
auto v6 = Load(s16, sp + 48);
auto v7 = Load(s16, sp + 56);

auto row0 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 8]));
row0 = InsertLane(row0, 3, ExtractLane(v2, 0));
auto row1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 8]));
auto row1_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 8]));
auto row2 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[3 * 8]));
auto row3 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 8]));
auto row3_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 * 8]));
auto v0 = Load(s16, sp);
auto v1 = Load(s16, sp + 8);
auto row0 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 8]));
auto v2 = Load(s16, sp + 16);
row0 = InsertLane(row0, 3, ExtractLane(v2, 0));

auto row1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 8]));
auto v3 = Load(s16, sp + 24);
auto v4 = Load(s16, sp + 32);
auto v5 = Load(s16, sp + 40);
auto v6 = Load(s16, sp + 48);

auto row1_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 8]));
auto m5 = FirstN(s16, 5);
row1 = IfThenZeroElse(m5, row1);
row1_1 = IfThenElseZero(m5, row1_1);
row1 = Or(row1, row1_1);
row1 = InsertLane(row1, 2, ExtractLane(v4, 0));
auto row2 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[3 * 8]));
auto m3 = FirstN(s16, 3);
row2 = IfThenZeroElse(m3, row2);
row2 = InsertLane(row2, 0, ExtractLane(v1, 4));
row2 = InsertLane(row2, 1, ExtractLane(v2, 3));
row2 = InsertLane(row2, 2, ExtractLane(v3, 2));
row2 = InsertLane(row2, 5, ExtractLane(v6, 0));
auto v7 = Load(s16, sp + 56);
auto row3 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 8]));
auto row3_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 * 8]));
alignas(16) uint8_t mask34[8] = {0b00111100};
auto m34 = LoadMaskBits(s16, mask34);
row3 = IfThenZeroElse(m34, row3);
row3_1 = IfThenElseZero(m34, row3_1);
row3 = Or(row3, row3_1);

auto row4 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[6 * 8]));
auto row4_1 = TwoTablesLookupLanes(s16, v6, v7, SetTableIndices(s16, &indices[7 * 8]));
row4 = IfThenZeroElse(m34, row4);
row4_1 = IfThenElseZero(m34, row4_1);
row4 = Or(row4, row4_1);
auto row5 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[8 * 8]));
auto row6 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[9 * 8]));
auto row6_1 = TwoTablesLookupLanes(s16, v6, v7, SetTableIndices(s16, &indices[10 * 8]));
auto row7 = TwoTablesLookupLanes(s16, v6, v7, SetTableIndices(s16, &indices[11 * 8]));
row7 = InsertLane(row7, 4, ExtractLane(v5, 7));

auto m5 = FirstN(s16, 5);
auto m3 = FirstN(s16, 3);
HWY_ALIGN uint8_t mask34[8] = {0b00111100};
auto m34 = LoadMaskBits(s16, mask34);

row1 = IfThenZeroElse(m5, row1);
row1_1 = IfThenElseZero(m5, row1_1);
row1 = Or(row1, row1_1);
row1 = InsertLane(row1, 2, ExtractLane(v4, 0));
row2 = IfThenZeroElse(m3, row2);
row2 = InsertLane(row2, 0, ExtractLane(v1, 4));
row2 = InsertLane(row2, 1, ExtractLane(v2, 3));
row2 = InsertLane(row2, 2, ExtractLane(v3, 2));
row2 = InsertLane(row2, 5, ExtractLane(v6, 0));
row3 = IfThenZeroElse(m34, row3);
row3_1 = IfThenElseZero(m34, row3_1);
row3 = Or(row3, row3_1);
row4 = IfThenZeroElse(m34, row4);
row4_1 = IfThenElseZero(m34, row4_1);
row4 = Or(row4, row4_1);
row5 = IfThenZeroElse(Not(m5), row5);
row5 = InsertLane(row5, 2, ExtractLane(v1, 7));
row5 = InsertLane(row5, 5, ExtractLane(v4, 5));
row5 = InsertLane(row5, 6, ExtractLane(v5, 4));
row5 = InsertLane(row5, 7, ExtractLane(v6, 3));
row6 = IfThenZeroElse(m3, row6);
row6_1 = IfThenElseZero(m3, row6_1);
row6 = Or(row6, row6_1);
row6 = InsertLane(row6, 5, ExtractLane(v3, 7));
row5 = IfThenZeroElse(Not(m5), row5);
row5 = InsertLane(row5, 2, ExtractLane(v1, 7));
row5 = InsertLane(row5, 5, ExtractLane(v4, 5));
row5 = InsertLane(row5, 6, ExtractLane(v5, 4));
row5 = InsertLane(row5, 7, ExtractLane(v6, 3));
row6 = IfThenZeroElse(m3, row6);
row6_1 = IfThenElseZero(m3, row6_1);
row6 = Or(row6, row6_1);
row6 = InsertLane(row6, 5, ExtractLane(v3, 7));

/* DCT block is now in zig-zag order; start Huffman encoding process. */

Expand Down
39 changes: 20 additions & 19 deletions lib/block_coding_256.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,23 @@ HWY_ALIGN constexpr int16_t indices[] = {

auto v0 = Load(s16, sp);
auto v1 = Load(s16, sp + 16);
auto v2 = Load(s16, sp + 32);
auto v3 = Load(s16, sp + 48);

auto row01 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 16]));
auto row23 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 16]));
auto row45 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 16]));
auto row67 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[3 * 16]));
auto row23_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 16]));

auto row01 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 16]));
auto v2 = Load(s16, sp + 32);
row01 = InsertLane(row01, 10, ExtractLane(v2, 0));
auto row23 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 16]));
auto v3 = Load(s16, sp + 48);
auto row23_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 16]));
HWY_ALIGN uint8_t m1[8] = {0x07, 0xFF};
auto maskv1 = LoadMaskBits(s16, m1);
row23 = IfThenElseZero(maskv1, row23);
row23_1 = IfThenZeroElse(maskv1, row23_1);
row23 = Or(row23, row23_1);

auto row45 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 16]));
auto row67 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[3 * 16]));
row67 = InsertLane(row67, 5, ExtractLane(v1, 15));

auto row45_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 * 16]));

// HWY_ALIGN int16_t m[32] = {
Expand All @@ -42,19 +51,11 @@ auto row45_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5
// row23_1 = IfThenZeroElse(MaskFromVec(maskv1), row23_1);
// row45_1 = IfThenZeroElse(MaskFromVec(maskv2), row45_1);

HWY_ALIGN uint8_t m1[8] = {0x07, 0xFF};
HWY_ALIGN uint8_t m2[8] = {0xFF, 0xE0};
auto maskv1 = LoadMaskBits(s16, m1);
auto maskv2 = LoadMaskBits(s16, m2);

row23 = IfThenElseZero(maskv1, row23);
row45 = IfThenElseZero(maskv2, row45);
row23_1 = IfThenZeroElse(maskv1, row23_1);
row45_1 = IfThenZeroElse(maskv2, row45_1);
row23 = Or(row23, row23_1);
row45 = Or(row45, row45_1);
row01 = InsertLane(row01, 10, ExtractLane(v2, 0));
row67 = InsertLane(row67, 5, ExtractLane(v1, 15));
row45 = IfThenElseZero(maskv2, row45);
row45_1 = IfThenZeroElse(maskv2, row45_1);
row45 = Or(row45, row45_1);

/* DCT block is now in zig-zag order; start Huffman encoding process. */

Expand Down

0 comments on commit f788f96

Please sign in to comment.