Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update CMakeLists.txt and improve huffman coding for AVX2 #31

Merged
merged 1 commit into from
Oct 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ set_target_properties(
#target_include_directories(hwytest PRIVATE thirdparty/hwy)
#target_link_libraries(hwytest PRIVATE hwy)

if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU|IntelLLVM")
set(CMAKE_CXX_FLAGS "-Wall -Wextra -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0 -g -fsanitize=address")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -DNDEBUG")
set(CMAKE_CXX_FLAGS_RelWithDebInfo "${CMAKE_CXX_FLAGS} -O3 -g -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -O3 -g -DNDEBUG")
endif()

if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") # MSVC
Expand Down
92 changes: 46 additions & 46 deletions lib/block_coding_128.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,59 +21,59 @@ HWY_ALIGN constexpr int16_t indices[] = {
HWY_CAPPED(uint8_t, Lanes(u8) / 2) u8_64;
HWY_CAPPED(uint64_t, Lanes(u64) / 2) u64_64;

auto v0 = Load(s16, sp);
auto v1 = Load(s16, sp + 8);
auto v2 = Load(s16, sp + 16);
auto v3 = Load(s16, sp + 24);
auto v4 = Load(s16, sp + 32);
auto v5 = Load(s16, sp + 40);
auto v6 = Load(s16, sp + 48);
auto v7 = Load(s16, sp + 56);

auto row0 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 8]));
row0 = InsertLane(row0, 3, ExtractLane(v2, 0));
auto row1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 8]));
auto row1_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 8]));
auto row2 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[3 * 8]));
auto row3 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 8]));
auto row3_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 * 8]));
auto v0 = Load(s16, sp);
auto v1 = Load(s16, sp + 8);
auto row0 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 8]));
auto v2 = Load(s16, sp + 16);
row0 = InsertLane(row0, 3, ExtractLane(v2, 0));

auto row1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 8]));
auto v3 = Load(s16, sp + 24);
auto v4 = Load(s16, sp + 32);
auto v5 = Load(s16, sp + 40);
auto v6 = Load(s16, sp + 48);

auto row1_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 8]));
auto m5 = FirstN(s16, 5);
row1 = IfThenZeroElse(m5, row1);
row1_1 = IfThenElseZero(m5, row1_1);
row1 = Or(row1, row1_1);
row1 = InsertLane(row1, 2, ExtractLane(v4, 0));
auto row2 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[3 * 8]));
auto m3 = FirstN(s16, 3);
row2 = IfThenZeroElse(m3, row2);
row2 = InsertLane(row2, 0, ExtractLane(v1, 4));
row2 = InsertLane(row2, 1, ExtractLane(v2, 3));
row2 = InsertLane(row2, 2, ExtractLane(v3, 2));
row2 = InsertLane(row2, 5, ExtractLane(v6, 0));
auto v7 = Load(s16, sp + 56);
auto row3 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 8]));
auto row3_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 * 8]));
alignas(16) uint8_t mask34[8] = {0b00111100};
auto m34 = LoadMaskBits(s16, mask34);
row3 = IfThenZeroElse(m34, row3);
row3_1 = IfThenElseZero(m34, row3_1);
row3 = Or(row3, row3_1);

auto row4 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[6 * 8]));
auto row4_1 = TwoTablesLookupLanes(s16, v6, v7, SetTableIndices(s16, &indices[7 * 8]));
row4 = IfThenZeroElse(m34, row4);
row4_1 = IfThenElseZero(m34, row4_1);
row4 = Or(row4, row4_1);
auto row5 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[8 * 8]));
auto row6 = TwoTablesLookupLanes(s16, v4, v5, SetTableIndices(s16, &indices[9 * 8]));
auto row6_1 = TwoTablesLookupLanes(s16, v6, v7, SetTableIndices(s16, &indices[10 * 8]));
auto row7 = TwoTablesLookupLanes(s16, v6, v7, SetTableIndices(s16, &indices[11 * 8]));
row7 = InsertLane(row7, 4, ExtractLane(v5, 7));

auto m5 = FirstN(s16, 5);
auto m3 = FirstN(s16, 3);
HWY_ALIGN uint8_t mask34[8] = {0b00111100};
auto m34 = LoadMaskBits(s16, mask34);

row1 = IfThenZeroElse(m5, row1);
row1_1 = IfThenElseZero(m5, row1_1);
row1 = Or(row1, row1_1);
row1 = InsertLane(row1, 2, ExtractLane(v4, 0));
row2 = IfThenZeroElse(m3, row2);
row2 = InsertLane(row2, 0, ExtractLane(v1, 4));
row2 = InsertLane(row2, 1, ExtractLane(v2, 3));
row2 = InsertLane(row2, 2, ExtractLane(v3, 2));
row2 = InsertLane(row2, 5, ExtractLane(v6, 0));
row3 = IfThenZeroElse(m34, row3);
row3_1 = IfThenElseZero(m34, row3_1);
row3 = Or(row3, row3_1);
row4 = IfThenZeroElse(m34, row4);
row4_1 = IfThenElseZero(m34, row4_1);
row4 = Or(row4, row4_1);
row5 = IfThenZeroElse(Not(m5), row5);
row5 = InsertLane(row5, 2, ExtractLane(v1, 7));
row5 = InsertLane(row5, 5, ExtractLane(v4, 5));
row5 = InsertLane(row5, 6, ExtractLane(v5, 4));
row5 = InsertLane(row5, 7, ExtractLane(v6, 3));
row6 = IfThenZeroElse(m3, row6);
row6_1 = IfThenElseZero(m3, row6_1);
row6 = Or(row6, row6_1);
row6 = InsertLane(row6, 5, ExtractLane(v3, 7));
row5 = IfThenZeroElse(Not(m5), row5);
row5 = InsertLane(row5, 2, ExtractLane(v1, 7));
row5 = InsertLane(row5, 5, ExtractLane(v4, 5));
row5 = InsertLane(row5, 6, ExtractLane(v5, 4));
row5 = InsertLane(row5, 7, ExtractLane(v6, 3));
row6 = IfThenZeroElse(m3, row6);
row6_1 = IfThenElseZero(m3, row6_1);
row6 = Or(row6, row6_1);
row6 = InsertLane(row6, 5, ExtractLane(v3, 7));

/* DCT block is now in zig-zag order; start Huffman encoding process. */

Expand Down
39 changes: 20 additions & 19 deletions lib/block_coding_256.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,23 @@ HWY_ALIGN constexpr int16_t indices[] = {

auto v0 = Load(s16, sp);
auto v1 = Load(s16, sp + 16);
auto v2 = Load(s16, sp + 32);
auto v3 = Load(s16, sp + 48);

auto row01 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 16]));
auto row23 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 16]));
auto row45 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 16]));
auto row67 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[3 * 16]));
auto row23_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 16]));

auto row01 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[0 * 16]));
auto v2 = Load(s16, sp + 32);
row01 = InsertLane(row01, 10, ExtractLane(v2, 0));
auto row23 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[1 * 16]));
auto v3 = Load(s16, sp + 48);
auto row23_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 16]));
HWY_ALIGN uint8_t m1[8] = {0x07, 0xFF};
auto maskv1 = LoadMaskBits(s16, m1);
row23 = IfThenElseZero(maskv1, row23);
row23_1 = IfThenZeroElse(maskv1, row23_1);
row23 = Or(row23, row23_1);

auto row45 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[2 * 16]));
auto row67 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[3 * 16]));
row67 = InsertLane(row67, 5, ExtractLane(v1, 15));

auto row45_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 * 16]));

// HWY_ALIGN int16_t m[32] = {
Expand All @@ -42,19 +51,11 @@ auto row45_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5
// row23_1 = IfThenZeroElse(MaskFromVec(maskv1), row23_1);
// row45_1 = IfThenZeroElse(MaskFromVec(maskv2), row45_1);

HWY_ALIGN uint8_t m1[8] = {0x07, 0xFF};
HWY_ALIGN uint8_t m2[8] = {0xFF, 0xE0};
auto maskv1 = LoadMaskBits(s16, m1);
auto maskv2 = LoadMaskBits(s16, m2);

row23 = IfThenElseZero(maskv1, row23);
row45 = IfThenElseZero(maskv2, row45);
row23_1 = IfThenZeroElse(maskv1, row23_1);
row45_1 = IfThenZeroElse(maskv2, row45_1);
row23 = Or(row23, row23_1);
row45 = Or(row45, row45_1);
row01 = InsertLane(row01, 10, ExtractLane(v2, 0));
row67 = InsertLane(row67, 5, ExtractLane(v1, 15));
row45 = IfThenElseZero(maskv2, row45);
row45_1 = IfThenZeroElse(maskv2, row45_1);
row45 = Or(row45, row45_1);

/* DCT block is now in zig-zag order; start Huffman encoding process. */

Expand Down