From 1dc6e6dd776f3cc96de5ee20c43bd68fa0df32fa Mon Sep 17 00:00:00 2001 From: OSAMU WATANABE Date: Thu, 12 Oct 2023 14:54:19 +0900 Subject: [PATCH 1/6] Padding for longer SIMD register --- lib/image_chunk.hpp | 3 ++- lib/jpegenc.cpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/image_chunk.hpp b/lib/image_chunk.hpp index e1d6146..cf4970e 100644 --- a/lib/image_chunk.hpp +++ b/lib/image_chunk.hpp @@ -9,6 +9,7 @@ #include "constants.hpp" #include "ycctype.hpp" +#include "hwy/ops/set_macros-inl.h" class imchunk { private: @@ -27,7 +28,7 @@ class imchunk { : width(w), height(h), ncomp(nc), - rounded_width(round_up(width, DCTSIZE * (YCC_HV[YCCtype][0] >> 4))), + rounded_width(round_up(width, HWY_MAX(DCTSIZE * (YCC_HV[YCCtype][0] >> 4), HWY_MAX_BYTES))), origin(p), g_buf(imdata), buf(hwy::AllocateAligned(static_cast(width) * ncomp * LINES)), diff --git a/lib/jpegenc.cpp b/lib/jpegenc.cpp index b3d34ae..112295c 100644 --- a/lib/jpegenc.cpp +++ b/lib/jpegenc.cpp @@ -42,8 +42,8 @@ class jpeg_encoder_impl { ncomp(inimg.nc), QF(qf), YCCtype(ycc), - rounded_width(round_up(inimg.width, DCTSIZE * (YCC_HV[YCCtype][0] >> 4))), - rounded_height(round_up(inimg.height, DCTSIZE * (YCC_HV[YCCtype][0] & 0xF))), + rounded_width(round_up(width, HWY_MAX(DCTSIZE * (YCC_HV[YCCtype][0] >> 4), HWY_MAX_BYTES))), + rounded_height(round_up(height, DCTSIZE * (YCC_HV[YCCtype][0] & 0xF))), line_buffer0(ncomp), line_buffer1(ncomp), yuv0(ncomp), From 88fa36d08fefaaea30a16057703d21e2c13f162b Mon Sep 17 00:00:00 2001 From: OSAMU WATANABE Date: Thu, 12 Oct 2023 14:55:12 +0900 Subject: [PATCH 2/6] Simplification of expanding buffer size --- lib/bitstream.hpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/bitstream.hpp b/lib/bitstream.hpp index 943caec..ad35734 100644 --- a/lib/bitstream.hpp +++ b/lib/bitstream.hpp @@ -32,13 +32,11 @@ class stream_buf { } inline void expand() { - uint8_t *p = buf.release(); std::unique_ptr new_buf = 
std::make_unique(len + len); - memcpy(new_buf.get(), p, len); - buf = std::move(new_buf); + memcpy(new_buf.get(), buf.get(), len); + buf.swap(new_buf); + new_buf.reset(); len += len; - delete[] p; - // __builtin_prefetch(buf.get() + pos, 0, 1); cur_byte = buf.get() + pos; } From 62df6695e4befea267e638940d37bf4b13e3d691 Mon Sep 17 00:00:00 2001 From: OSAMU WATANABE Date: Thu, 12 Oct 2023 14:56:55 +0900 Subject: [PATCH 3/6] Improve speed and readability a little --- lib/block_coding_128.cpp | 3 ++- lib/block_coding_256.cpp | 5 ++--- lib/block_coding_512.cpp | 6 ++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/lib/block_coding_128.cpp b/lib/block_coding_128.cpp index b012647..29824b2 100644 --- a/lib/block_coding_128.cpp +++ b/lib/block_coding_128.cpp @@ -112,7 +112,8 @@ auto bitmap_rows_7654 = Padd(u8, bitmap_rows_76, bitmap_rows_54); auto bitmap_rows_76543210 = Padd(u8, bitmap_rows_7654, bitmap_rows_3210); auto bitmap_all = Padd(u8_64, LowerHalf(bitmap_rows_76543210), UpperHalf(u8_64, bitmap_rows_76543210)); /* Move bitmap to 64-bit scalar register. 
*/ -bitmap = GetLane(BitCast(u64_64, bitmap_all)); +Store(BitCast(u64_64, bitmap_all), u64_64, &bitmap); +// bitmap = GetLane(BitCast(u64_64, bitmap_all)); auto abs_row0 = Abs(row0); auto abs_row1 = Abs(row1); diff --git a/lib/block_coding_256.cpp b/lib/block_coding_256.cpp index 73372b8..189d062 100644 --- a/lib/block_coding_256.cpp +++ b/lib/block_coding_256.cpp @@ -71,9 +71,8 @@ auto row3210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row01_ne_0), BitCast(u16 auto row7654_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row45_ne_0), BitCast(u16, row67_ne_0)); /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */ -HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080, - 0x0102040810204080}; -auto bitmap_mask = BitCast(u8, Load(u64, bm)); +HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080}; +auto bitmap_mask = BitCast(u8, LoadDup128(u64, bm)); auto bitmap_rows_3210 = AndNot(row3210_ne_0, bitmap_mask); auto bitmap_rows_7654 = AndNot(row7654_ne_0, bitmap_mask); diff --git a/lib/block_coding_512.cpp b/lib/block_coding_512.cpp index b59733d..1d5cea6 100644 --- a/lib/block_coding_512.cpp +++ b/lib/block_coding_512.cpp @@ -31,10 +31,8 @@ auto row4567_ne_0 = VecFromMask(s16, Eq(row4567, zero)); auto row76543210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row0123_ne_0), BitCast(u16, row4567_ne_0)); /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */ -HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080, - 0x0102040810204080, 0x0102040810204080, 0x0102040810204080, - 0x0102040810204080, 0x0102040810204080}; -auto bitmap_mask = BitCast(u8, Load(u64, bm)); +HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080}; +auto bitmap_mask = BitCast(u8, LoadDup128(u64, bm)); auto bitmap_rows_76543210 = AndNot(row76543210_ne_0, bitmap_mask); auto a0 = SumsOf8(bitmap_rows_76543210); From 388bc3e1806b36c9cc0aa8be87562e4811803180 Mon Sep 17 00:00:00 2001 
From: OSAMU WATANABE Date: Thu, 12 Oct 2023 14:57:19 +0900 Subject: [PATCH 4/6] Fix bug in flush() --- lib/bitstream.hpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/lib/bitstream.hpp b/lib/bitstream.hpp index ad35734..258634a 100644 --- a/lib/bitstream.hpp +++ b/lib/bitstream.hpp @@ -10,6 +10,9 @@ #include "jpgmarkers.hpp" +#define BIT_BUF_SIZE 64 +#define BYTE_BUF_SIZE 8 + #define USE_VECTOR 0 namespace jpegenc_hwy { @@ -49,7 +52,7 @@ class stream_buf { } inline void put_qword(uint64_t val) { - if (pos + 8 > len) { + if (pos + BYTE_BUF_SIZE > len) { expand(); } // emits eight uint8_t values at once @@ -59,10 +62,11 @@ class stream_buf { *(uint64_t *)cur_byte = __builtin_bswap64(val); #elif HWY_TARGET <= HWY_SSE2 *(uint64_t *)cur_byte = __bswap_64(val); +#else + jpegenc_hwy::send_8_bytes((uint8_t *)&val, cur_byte); #endif - // jpegenc_hwy::send_8_bytes((uint8_t *)&val, cur_byte); - cur_byte += 8; - pos += 8; + cur_byte += BYTE_BUF_SIZE; + pos += BYTE_BUF_SIZE; } uint8_t *get_buf() { @@ -126,11 +130,11 @@ class bitstream { // int n = (bits + 8 - 1) / 8; // tmp <<= 8 * n - bits; // tmp |= ~(0xFFFFFFFFFFFFFFFFUL << (8 * n - bits)); - const int bits_to_flush = 64 - bits; + const int bits_to_flush = BIT_BUF_SIZE - bits; int n = (bits_to_flush + 8 - 1) / 8; tmp <<= 8 * n - bits_to_flush; tmp |= ~(0xFFFFFFFFFFFFFFFFUL << (8 * n - bits_to_flush)); - uint64_t mask = 0xFF00000000000000UL >> (64 - n * 8); + uint64_t mask = 0xFF00000000000000UL >> (BIT_BUF_SIZE - n * 8); for (int i = n - 1; i >= 0; --i) { uint8_t upper_byte = (tmp & mask) >> (8 * i); put_byte(upper_byte); @@ -141,7 +145,7 @@ class bitstream { mask >>= 8; } tmp = 0; - bits = 0; + bits = BIT_BUF_SIZE; } public: @@ -151,7 +155,7 @@ class bitstream { explicit bitstream(size_t length) : bits(0), tmp(0) { stream.reserve(length); } inline void put_byte(uint8_t d) { stream.push_back(d); } #else - explicit bitstream(size_t length) : bits(64), tmp(0), stream(length) {} + 
explicit bitstream(size_t length) : bits(BIT_BUF_SIZE), tmp(0), stream(length) {} inline void put_byte(uint8_t d) { stream.put_byte(d); } #endif @@ -169,7 +173,7 @@ class bitstream { // PUT_AND_FLUSH tmp = (tmp << (len + bits)) | (cwd >> -bits); emit_qword(tmp); - bits += 64; + bits += BIT_BUF_SIZE; tmp = cwd; } else { tmp = (tmp << len) | cwd; From 912a10bae31c564904609377e36b46478f68259f Mon Sep 17 00:00:00 2001 From: OSAMU WATANABE Date: Thu, 12 Oct 2023 14:59:24 +0900 Subject: [PATCH 5/6] Reduce number of temporary allocations --- lib/jpgheaders.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/jpgheaders.cpp b/lib/jpgheaders.cpp index 931e5d3..2608b58 100644 --- a/lib/jpgheaders.cpp +++ b/lib/jpgheaders.cpp @@ -74,6 +74,7 @@ void create_DHT(int c, bitstream &enc) { } } std::vector tmp; + tmp.reserve(256); // Li for (int f : freq) { tmp.push_back(f); From d956e03be7b39c4a1ef6aa5ed7319f5181e7516e Mon Sep 17 00:00:00 2001 From: OSAMU WATANABE Date: Thu, 12 Oct 2023 14:59:52 +0900 Subject: [PATCH 6/6] Add __restrict__ attribute for output pointers --- lib/color.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/color.cpp b/lib/color.cpp index 53d25b5..d11d576 100644 --- a/lib/color.cpp +++ b/lib/color.cpp @@ -29,10 +29,10 @@ HWY_ATTR void rgb2ycbcr(uint8_t *HWY_RESTRICT in, std::vector &out, i auto v1 = Undefined(u8); auto v2 = Undefined(u8); - uint8_t *o0 = out[0]; - uint8_t *o1 = out[1]; - uint8_t *o2 = out[2]; - constexpr size_t N = Lanes(u8); + uint8_t *HWY_RESTRICT o0 = out[0]; + uint8_t *HWY_RESTRICT o1 = out[1]; + uint8_t *HWY_RESTRICT o2 = out[2]; + constexpr size_t N = Lanes(u8); for (size_t i = width * LINES; i > 0; i -= N) { LoadInterleaved3(u8, in, v0, v1, v2);