Skip to content

Commit

Permalink
Merge pull request #24 from osamu620/develop
Browse files Browse the repository at this point in the history
Reduce the number of Interleaved Store & Load in color conversion and subsampling
  • Loading branch information
osamu620 authored Oct 3, 2023
2 parents fba7583 + ab64f78 commit 78f4e2e
Show file tree
Hide file tree
Showing 7 changed files with 257 additions and 197 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
set(CMAKE_CXX_FLAGS "-Wall -Wextra -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0 -g -fsanitize=address")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -DNDEBUG")
set(CMAKE_CXX_FLAGS_RelWithDebInfo "${CMAKE_CXX_FLAGS} -O2 -g -DNDEBUG")
set(CMAKE_CXX_FLAGS_RelWithDebInfo "${CMAKE_CXX_FLAGS} -O3 -g -DNDEBUG")
endif()

if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") # MSVC
Expand Down
2 changes: 1 addition & 1 deletion lib/block_coding_256.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ HWY_ALIGN uint64_t shift[4] = {24, 16, 8, 0};
const auto vs = Load(u64, shift);
a0 = Shl(a0, vs);
a1 = Shl(a1, vs);
bitmap = (GetLane(SumOfLanes(u64, a0)) << 32) + GetLane(SumOfLanes(u64, a1));
bitmap = (ReduceSum(u64, a0) << 32) | ReduceSum(u64, a1);

auto abs_row01 = Abs(row01);
auto abs_row23 = Abs(row23);
Expand Down
2 changes: 1 addition & 1 deletion lib/block_coding_512.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ auto a0 = SumsOf8(bitmap_rows_76543210);
HWY_ALIGN uint64_t shift[8] = {56, 48, 40, 32, 24, 16, 8, 0};
const auto vs = Load(u64, shift);
a0 = Shl(a0, vs);
bitmap = GetLane(SumOfLanes(u64, a0));
bitmap = ReduceSum(u64, a0);

auto abs_row0123 = Abs(row0123);
auto abs_row4567 = Abs(row4567);
Expand Down
385 changes: 214 additions & 171 deletions lib/color.cpp

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions lib/color.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
#include <vector>

// Declarations of the color-conversion (rgb2ycbcr) and subsampling (subsample) entry points.
// NOTE(review): this listing is a diff rendering without +/- markers, and its header for this
// file reports "2 additions & 2 deletions". The first pair of declarations therefore appears to
// be the removed (pre-commit) pair and the second the added (post-commit) pair — confirm against
// the repository before relying on either.
namespace jpegenc_hwy {
// Takes one uint8_t buffer and a width; there is no separate output argument in this form.
void rgb2ycbcr(uint8_t *in, int width);
// Takes one uint8_t input buffer, a vector of int16_t pointers by non-const reference,
// a width and a YCC type selector.
void subsample(uint8_t *in, std::vector<int16_t *> &out, int width, int YCCtype);
// Takes the uint8_t input buffer plus a vector of uint8_t pointers by non-const reference.
// The call sites visible in this listing pass (src, yuv0, rounded_width).
void rgb2ycbcr(uint8_t *in, std::vector<uint8_t *> &out, int width);
// Takes an unnamed vector of uint8_t pointers BY VALUE as its first parameter.
// NOTE(review): the call sites visible in this listing pass the lvalue yuv0, so the vector of
// pointers is copied on every call; a const reference would avoid that, but only if the
// definition (not shown here) is changed in step.
void subsample(std::vector<uint8_t *>, std::vector<int16_t *> &out, int width, int YCCtype);
} // namespace jpegenc_hwy
57 changes: 37 additions & 20 deletions lib/jpegenc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ class jpeg_encoder_impl {
int YCCtype;
const int rounded_width;
const int rounded_height;
std::vector<std::unique_ptr<int16_t[], hwy::AlignedFreer>> line_buffer;
std::vector<std::unique_ptr<uint8_t[], hwy::AlignedFreer>> line_buffer0;
std::vector<std::unique_ptr<int16_t[], hwy::AlignedFreer>> line_buffer1;
std::unique_ptr<int16_t[], hwy::AlignedFreer> mcu_buffer;
std::vector<int16_t *> yuv;
std::vector<uint8_t *> yuv0;
std::vector<int16_t *> yuv1;
int16_t *mcu;
HWY_ALIGN int16_t qtable[DCTSIZE2 * 2];
bitstream enc;
Expand All @@ -43,33 +45,44 @@ class jpeg_encoder_impl {
YCCtype(ycc),
rounded_width(round_up(inimg.width, DCTSIZE * (YCC_HV[YCCtype][0] >> 4))),
rounded_height(round_up(inimg.height, DCTSIZE * (YCC_HV[YCCtype][0] & 0xF))),
line_buffer(ncomp),
yuv(ncomp),
line_buffer0(ncomp),
line_buffer1(ncomp),
yuv0(ncomp),
yuv1(ncomp),
qtable{0},
enc(3000000),
use_RESET(false) {
int nc = inimg.nc;
if (nc == 1) {
int ncomp_out = inimg.nc;
if (ncomp_out == 1) {
YCCtype = YCC::GRAY;
}
nc = (YCCtype == YCC::GRAY2) ? 1 : nc;
ncomp_out = (YCCtype == YCC::GRAY2) ? 1 : ncomp_out;
const int scale_x = YCC_HV[YCCtype][0] >> 4;
const int scale_y = YCC_HV[YCCtype][0] & 0xF;
const size_t bufsize_L = rounded_width * LINES;
const size_t bufsize_C = rounded_width / scale_x * LINES / scale_y;

// Prepare line-buffers
line_buffer[0] = hwy::AllocateAligned<int16_t>(bufsize_L);
for (size_t c = 1; c < line_buffer.size(); ++c) {
line_buffer[c] = hwy::AllocateAligned<int16_t>(bufsize_C);
line_buffer0[0] = hwy::AllocateAligned<uint8_t>(bufsize_L);
for (int c = 1; c < ncomp; ++c) {
line_buffer0[c] = hwy::AllocateAligned<uint8_t>(bufsize_L);
}
yuv[0] = line_buffer[0].get();
for (int c = 1; c < nc; ++c) {
yuv[c] = line_buffer[c].get();
yuv0[0] = line_buffer0[0].get();
for (int c = 1; c < ncomp; ++c) {
yuv0[c] = line_buffer0[c].get();
}

line_buffer1[0] = hwy::AllocateAligned<int16_t>(bufsize_L);
for (size_t c = 1; c < line_buffer1.size(); ++c) {
line_buffer1[c] = hwy::AllocateAligned<int16_t>(bufsize_C);
}
yuv1[0] = line_buffer1[0].get();
for (int c = 1; c < ncomp_out; ++c) {
yuv1[c] = line_buffer1[c].get();
}

// Prepare mcu-buffers
const int c = (nc == 1) ? 1 : 0;
const int c = (ncomp_out == 1) ? 1 : 0;
mcu_buffer = hwy::AllocateAligned<int16_t>(DCTSIZE2 * scale_x * scale_y + ((DCTSIZE2 * 2) >> c));
mcu = mcu_buffer.get();
}
Expand All @@ -93,10 +106,12 @@ class jpeg_encoder_impl {
// Loop of 16 pixels height
for (int n = 0; n < rounded_height - LINES; n += LINES) {
if (ncomp == 3) {
jpegenc_hwy::rgb2ycbcr(src, rounded_width);
jpegenc_hwy::rgb2ycbcr(src, yuv0, rounded_width);
} else {
yuv0[0] = src;
}
jpegenc_hwy::subsample(src, yuv, rounded_width, YCCtype);
jpegenc_hwy::encode_lines(yuv, mcu, rounded_width, LINES, YCCtype, qtable, prev_dc, tab_Y, tab_C,
jpegenc_hwy::subsample(yuv0, yuv1, rounded_width, YCCtype);
jpegenc_hwy::encode_lines(yuv1, mcu, rounded_width, LINES, YCCtype, qtable, prev_dc, tab_Y, tab_C,
enc);
// RST marker insertion, if any
if (use_RESET) {
Expand All @@ -109,10 +124,12 @@ class jpeg_encoder_impl {
const int last_mcu_height = (rounded_height % LINES) ? DCTSIZE : LINES;

if (ncomp == 3) {
jpegenc_hwy::rgb2ycbcr(src, rounded_width);
jpegenc_hwy::rgb2ycbcr(src, yuv0, rounded_width);
} else {
yuv0[0] = src;
}
jpegenc_hwy::subsample(src, yuv, rounded_width, YCCtype);
jpegenc_hwy::encode_lines(yuv, mcu, rounded_width, last_mcu_height, YCCtype, qtable, prev_dc, tab_Y,
jpegenc_hwy::subsample(yuv0, yuv1, rounded_width, YCCtype);
jpegenc_hwy::encode_lines(yuv1, mcu, rounded_width, last_mcu_height, YCCtype, qtable, prev_dc, tab_Y,
tab_C, enc);

// Finalize codestream
Expand Down
2 changes: 1 addition & 1 deletion thirdparty/highway
Submodule highway updated 51 files
+33 −3 .github/workflows/build_test.yml
+70 −0 .github/workflows/codeql.yml
+18 −0 BUILD
+16 −4 CMakeLists.txt
+6 −2 README.md
+1 −1 g3doc/design_philosophy.md
+40 −4 g3doc/faq.md
+19 −18 g3doc/op_wishlist.md
+222 −113 g3doc/quick_reference.md
+1 −1 hwy/aligned_allocator_test.cc
+242 −85 hwy/base.h
+70 −2 hwy/base_test.cc
+1 −1 hwy/contrib/dot/dot_test.cc
+18 −28 hwy/contrib/math/math_test.cc
+1 −1 hwy/contrib/sort/algo-inl.h
+1 −10 hwy/contrib/sort/result-inl.h
+150 −0 hwy/contrib/thread_pool/thread_pool.cc
+174 −0 hwy/contrib/thread_pool/thread_pool.h
+130 −0 hwy/contrib/thread_pool/thread_pool_test.cc
+11 −2 hwy/contrib/unroller/unroller_test.cc
+3 −1 hwy/detect_compiler_arch.h
+13 −1 hwy/detect_targets.h
+2 −0 hwy/highway_test.cc
+186 −121 hwy/ops/arm_neon-inl.h
+378 −24 hwy/ops/arm_sve-inl.h
+49 −32 hwy/ops/emu128-inl.h
+625 −31 hwy/ops/generic_ops-inl.h
+365 −268 hwy/ops/ppc_vsx-inl.h
+198 −44 hwy/ops/rvv-inl.h
+36 −26 hwy/ops/scalar-inl.h
+18 −3 hwy/ops/set_macros-inl.h
+41 −12 hwy/ops/shared-inl.h
+35 −121 hwy/ops/wasm_128-inl.h
+28 −28 hwy/ops/wasm_256-inl.h
+517 −329 hwy/ops/x86_128-inl.h
+512 −214 hwy/ops/x86_256-inl.h
+421 −228 hwy/ops/x86_512-inl.h
+1 −237 hwy/tests/arithmetic_test.cc
+7 −7 hwy/tests/blockwise_test.cc
+62 −0 hwy/tests/combine_test.cc
+127 −0 hwy/tests/convert_test.cc
+9 −3 hwy/tests/mask_mem_test.cc
+276 −0 hwy/tests/masked_arithmetic_test.cc
+275 −0 hwy/tests/minmax_test.cc
+2 −2 hwy/tests/mul_test.cc
+115 −30 hwy/tests/reduction_test.cc
+5 −5 hwy/tests/shift_test.cc
+12 −0 hwy/tests/test_util-inl.h
+27 −0 hwy/tests/test_util.h
+1 −1 hwy/tests/widen_mul_test.cc
+11 −0 hwy/timer.h

0 comments on commit 78f4e2e

Please sign in to comment.