Skip to content

Commit

Permalink
Merge pull request #33 from osamu620/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
osamu620 authored Oct 12, 2023
2 parents 436efcd + d956e03 commit 0ca69f1
Show file tree
Hide file tree
Showing 8 changed files with 31 additions and 29 deletions.
30 changes: 16 additions & 14 deletions lib/bitstream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@

#include "jpgmarkers.hpp"

#define BIT_BUF_SIZE 64
#define BYTE_BUF_SIZE 8

#define USE_VECTOR 0

namespace jpegenc_hwy {
Expand All @@ -32,13 +35,11 @@ class stream_buf {
}

// Doubles the capacity of the byte buffer while keeping its existing contents.
// The new storage is fully populated before ownership is transferred, so `buf`
// never ends up null or pointing at freed memory, and the old array is released
// by unique_ptr instead of a manual release()/delete[] pair.
inline void expand() {
  std::unique_ptr<uint8_t[]> new_buf = std::make_unique<uint8_t[]>(len + len);
  // Copy the whole old buffer (len bytes) into the front of the new one.
  memcpy(new_buf.get(), buf.get(), len);
  // Move-assignment frees the old array and adopts the new one in one step.
  buf = std::move(new_buf);
  len += len;
  // The storage address changed, so re-derive the write cursor from the offset.
  cur_byte = buf.get() + pos;
}

Expand All @@ -51,7 +52,7 @@ class stream_buf {
}

inline void put_qword(uint64_t val) {
if (pos + 8 > len) {
if (pos + BYTE_BUF_SIZE > len) {
expand();
}
// emits eight uint8_t values at once
Expand All @@ -61,10 +62,11 @@ class stream_buf {
*(uint64_t *)cur_byte = __builtin_bswap64(val);
#elif HWY_TARGET <= HWY_SSE2
*(uint64_t *)cur_byte = __bswap_64(val);
#else
jpegenc_hwy::send_8_bytes((uint8_t *)&val, cur_byte);
#endif
// jpegenc_hwy::send_8_bytes((uint8_t *)&val, cur_byte);
cur_byte += 8;
pos += 8;
cur_byte += BYTE_BUF_SIZE;
pos += BYTE_BUF_SIZE;
}

uint8_t *get_buf() {
Expand Down Expand Up @@ -128,11 +130,11 @@ class bitstream {
// int n = (bits + 8 - 1) / 8;
// tmp <<= 8 * n - bits;
// tmp |= ~(0xFFFFFFFFFFFFFFFFUL << (8 * n - bits));
const int bits_to_flush = 64 - bits;
const int bits_to_flush = BIT_BUF_SIZE - bits;
int n = (bits_to_flush + 8 - 1) / 8;
tmp <<= 8 * n - bits_to_flush;
tmp |= ~(0xFFFFFFFFFFFFFFFFUL << (8 * n - bits_to_flush));
uint64_t mask = 0xFF00000000000000UL >> (64 - n * 8);
uint64_t mask = 0xFF00000000000000UL >> (BIT_BUF_SIZE - n * 8);
for (int i = n - 1; i >= 0; --i) {
uint8_t upper_byte = (tmp & mask) >> (8 * i);
put_byte(upper_byte);
Expand All @@ -143,7 +145,7 @@ class bitstream {
mask >>= 8;
}
tmp = 0;
bits = 0;
bits = BIT_BUF_SIZE;
}

public:
Expand All @@ -153,7 +155,7 @@ class bitstream {
explicit bitstream(size_t length) : bits(0), tmp(0) { stream.reserve(length); }
inline void put_byte(uint8_t d) { stream.push_back(d); }
#else
explicit bitstream(size_t length) : bits(64), tmp(0), stream(length) {}
explicit bitstream(size_t length) : bits(BIT_BUF_SIZE), tmp(0), stream(length) {}
inline void put_byte(uint8_t d) { stream.put_byte(d); }
#endif

Expand All @@ -171,7 +173,7 @@ class bitstream {
// PUT_AND_FLUSH
tmp = (tmp << (len + bits)) | (cwd >> -bits);
emit_qword(tmp);
bits += 64;
bits += BIT_BUF_SIZE;
tmp = cwd;
} else {
tmp = (tmp << len) | cwd;
Expand Down
3 changes: 2 additions & 1 deletion lib/block_coding_128.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ auto bitmap_rows_7654 = Padd(u8, bitmap_rows_76, bitmap_rows_54);
auto bitmap_rows_76543210 = Padd(u8, bitmap_rows_7654, bitmap_rows_3210);
auto bitmap_all = Padd(u8_64, LowerHalf(bitmap_rows_76543210), UpperHalf(u8_64, bitmap_rows_76543210));
/* Move bitmap to 64-bit scalar register. */
bitmap = GetLane(BitCast(u64_64, bitmap_all));
Store(BitCast(u64_64, bitmap_all), u64_64, &bitmap);
// bitmap = GetLane(BitCast(u64_64, bitmap_all));

auto abs_row0 = Abs(row0);
auto abs_row1 = Abs(row1);
Expand Down
5 changes: 2 additions & 3 deletions lib/block_coding_256.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,8 @@ auto row3210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row01_ne_0), BitCast(u16
auto row7654_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row45_ne_0), BitCast(u16, row67_ne_0));

/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
0x0102040810204080};
auto bitmap_mask = BitCast(u8, Load(u64, bm));
HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080};
auto bitmap_mask = BitCast(u8, LoadDup128(u64, bm));

auto bitmap_rows_3210 = AndNot(row3210_ne_0, bitmap_mask);
auto bitmap_rows_7654 = AndNot(row7654_ne_0, bitmap_mask);
Expand Down
6 changes: 2 additions & 4 deletions lib/block_coding_512.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,8 @@ auto row4567_ne_0 = VecFromMask(s16, Eq(row4567, zero));
auto row76543210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row0123_ne_0), BitCast(u16, row4567_ne_0));

/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
0x0102040810204080, 0x0102040810204080};
auto bitmap_mask = BitCast(u8, Load(u64, bm));
HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080};
auto bitmap_mask = BitCast(u8, LoadDup128(u64, bm));

auto bitmap_rows_76543210 = AndNot(row76543210_ne_0, bitmap_mask);
auto a0 = SumsOf8(bitmap_rows_76543210);
Expand Down
8 changes: 4 additions & 4 deletions lib/color.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ HWY_ATTR void rgb2ycbcr(uint8_t *HWY_RESTRICT in, std::vector<uint8_t *> &out, i
auto v1 = Undefined(u8);
auto v2 = Undefined(u8);

uint8_t *o0 = out[0];
uint8_t *o1 = out[1];
uint8_t *o2 = out[2];
constexpr size_t N = Lanes(u8);
uint8_t *HWY_RESTRICT o0 = out[0];
uint8_t *HWY_RESTRICT o1 = out[1];
uint8_t *HWY_RESTRICT o2 = out[2];
constexpr size_t N = Lanes(u8);
for (size_t i = width * LINES; i > 0; i -= N) {
LoadInterleaved3(u8, in, v0, v1, v2);

Expand Down
3 changes: 2 additions & 1 deletion lib/image_chunk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "constants.hpp"
#include "ycctype.hpp"
#include "hwy/ops/set_macros-inl.h"

class imchunk {
private:
Expand All @@ -27,7 +28,7 @@ class imchunk {
: width(w),
height(h),
ncomp(nc),
rounded_width(round_up(width, DCTSIZE * (YCC_HV[YCCtype][0] >> 4))),
rounded_width(round_up(width, HWY_MAX(DCTSIZE * (YCC_HV[YCCtype][0] >> 4), HWY_MAX_BYTES))),
origin(p),
g_buf(imdata),
buf(hwy::AllocateAligned<uint8_t>(static_cast<size_t>(width) * ncomp * LINES)),
Expand Down
4 changes: 2 additions & 2 deletions lib/jpegenc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ class jpeg_encoder_impl {
ncomp(inimg.nc),
QF(qf),
YCCtype(ycc),
rounded_width(round_up(inimg.width, DCTSIZE * (YCC_HV[YCCtype][0] >> 4))),
rounded_height(round_up(inimg.height, DCTSIZE * (YCC_HV[YCCtype][0] & 0xF))),
rounded_width(round_up(width, HWY_MAX(DCTSIZE * (YCC_HV[YCCtype][0] >> 4), HWY_MAX_BYTES))),
rounded_height(round_up(height, DCTSIZE * (YCC_HV[YCCtype][0] & 0xF))),
line_buffer0(ncomp),
line_buffer1(ncomp),
yuv0(ncomp),
Expand Down
1 change: 1 addition & 0 deletions lib/jpgheaders.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ void create_DHT(int c, bitstream &enc) {
}
}
std::vector<uint8_t> tmp;
tmp.reserve(256);
// Li
for (int f : freq) {
tmp.push_back(f);
Expand Down

0 comments on commit 0ca69f1

Please sign in to comment.