Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #33

Merged
merged 6 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 16 additions & 14 deletions lib/bitstream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@

#include "jpgmarkers.hpp"

#define BIT_BUF_SIZE 64
#define BYTE_BUF_SIZE 8

#define USE_VECTOR 0

namespace jpegenc_hwy {
Expand All @@ -32,13 +35,11 @@ class stream_buf {
}

// Intended to grow the stream buffer to twice its current length (len + len),
// carry the existing len bytes over into the new allocation, and re-point
// cur_byte at offset pos inside the new buffer.
//
// NOTE(review): this text comes from a diff view that shows removed and added
// lines together without +/- markers, so the body below is not one coherent
// function. Read literally, `buf = std::move(new_buf)` leaves new_buf empty
// before the second memcpy and before the swap. Presumably the
// release() / memcpy-from-p / move-assign / delete[] statements are the
// pre-change version and the memcpy-from-buf / swap() / reset() statements are
// the post-change version -- confirm against lib/bitstream.hpp in the
// repository before relying on either reading.
inline void expand() {
// Detaches the current allocation from buf; p is freed by the delete[] below.
uint8_t *p = buf.release();
// New allocation with twice the current length.
std::unique_ptr<uint8_t[]> new_buf = std::make_unique<uint8_t[]>(len + len);
memcpy(new_buf.get(), p, len);
buf = std::move(new_buf);
memcpy(new_buf.get(), buf.get(), len);
// swap() exchanges what the two owners hold; reset() then releases whatever
// new_buf holds afterwards.
buf.swap(new_buf);
new_buf.reset();
// Record the doubled capacity.
len += len;
delete[] p;
// __builtin_prefetch(buf.get() + pos, 0, 1);
// Re-derive the write cursor from pos against the buffer buf now refers to.
cur_byte = buf.get() + pos;
}

Expand All @@ -51,7 +52,7 @@ class stream_buf {
}

inline void put_qword(uint64_t val) {
if (pos + 8 > len) {
if (pos + BYTE_BUF_SIZE > len) {
expand();
}
// emits eight uint8_t values at once
Expand All @@ -61,10 +62,11 @@ class stream_buf {
*(uint64_t *)cur_byte = __builtin_bswap64(val);
#elif HWY_TARGET <= HWY_SSE2
*(uint64_t *)cur_byte = __bswap_64(val);
#else
jpegenc_hwy::send_8_bytes((uint8_t *)&val, cur_byte);
#endif
// jpegenc_hwy::send_8_bytes((uint8_t *)&val, cur_byte);
cur_byte += 8;
pos += 8;
cur_byte += BYTE_BUF_SIZE;
pos += BYTE_BUF_SIZE;
}

uint8_t *get_buf() {
Expand Down Expand Up @@ -128,11 +130,11 @@ class bitstream {
// int n = (bits + 8 - 1) / 8;
// tmp <<= 8 * n - bits;
// tmp |= ~(0xFFFFFFFFFFFFFFFFUL << (8 * n - bits));
const int bits_to_flush = 64 - bits;
const int bits_to_flush = BIT_BUF_SIZE - bits;
int n = (bits_to_flush + 8 - 1) / 8;
tmp <<= 8 * n - bits_to_flush;
tmp |= ~(0xFFFFFFFFFFFFFFFFUL << (8 * n - bits_to_flush));
uint64_t mask = 0xFF00000000000000UL >> (64 - n * 8);
uint64_t mask = 0xFF00000000000000UL >> (BIT_BUF_SIZE - n * 8);
for (int i = n - 1; i >= 0; --i) {
uint8_t upper_byte = (tmp & mask) >> (8 * i);
put_byte(upper_byte);
Expand All @@ -143,7 +145,7 @@ class bitstream {
mask >>= 8;
}
tmp = 0;
bits = 0;
bits = BIT_BUF_SIZE;
}

public:
Expand All @@ -153,7 +155,7 @@ class bitstream {
explicit bitstream(size_t length) : bits(0), tmp(0) { stream.reserve(length); }
inline void put_byte(uint8_t d) { stream.push_back(d); }
#else
explicit bitstream(size_t length) : bits(64), tmp(0), stream(length) {}
explicit bitstream(size_t length) : bits(BIT_BUF_SIZE), tmp(0), stream(length) {}
inline void put_byte(uint8_t d) { stream.put_byte(d); }
#endif

Expand All @@ -171,7 +173,7 @@ class bitstream {
// PUT_AND_FLUSH
tmp = (tmp << (len + bits)) | (cwd >> -bits);
emit_qword(tmp);
bits += 64;
bits += BIT_BUF_SIZE;
tmp = cwd;
} else {
tmp = (tmp << len) | cwd;
Expand Down
3 changes: 2 additions & 1 deletion lib/block_coding_128.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ auto bitmap_rows_7654 = Padd(u8, bitmap_rows_76, bitmap_rows_54);
auto bitmap_rows_76543210 = Padd(u8, bitmap_rows_7654, bitmap_rows_3210);
auto bitmap_all = Padd(u8_64, LowerHalf(bitmap_rows_76543210), UpperHalf(u8_64, bitmap_rows_76543210));
/* Move bitmap to 64-bit scalar register. */
bitmap = GetLane(BitCast(u64_64, bitmap_all));
Store(BitCast(u64_64, bitmap_all), u64_64, &bitmap);
// bitmap = GetLane(BitCast(u64_64, bitmap_all));

auto abs_row0 = Abs(row0);
auto abs_row1 = Abs(row1);
Expand Down
5 changes: 2 additions & 3 deletions lib/block_coding_256.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,8 @@ auto row3210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row01_ne_0), BitCast(u16
auto row7654_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row45_ne_0), BitCast(u16, row67_ne_0));

/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
0x0102040810204080};
auto bitmap_mask = BitCast(u8, Load(u64, bm));
HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080};
auto bitmap_mask = BitCast(u8, LoadDup128(u64, bm));

auto bitmap_rows_3210 = AndNot(row3210_ne_0, bitmap_mask);
auto bitmap_rows_7654 = AndNot(row7654_ne_0, bitmap_mask);
Expand Down
6 changes: 2 additions & 4 deletions lib/block_coding_512.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,8 @@ auto row4567_ne_0 = VecFromMask(s16, Eq(row4567, zero));
auto row76543210_ne_0 = OrderedTruncate2To(u8, BitCast(u16, row0123_ne_0), BitCast(u16, row4567_ne_0));

/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
0x0102040810204080, 0x0102040810204080, 0x0102040810204080,
0x0102040810204080, 0x0102040810204080};
auto bitmap_mask = BitCast(u8, Load(u64, bm));
HWY_ALIGN constexpr uint64_t bm[] = {0x0102040810204080, 0x0102040810204080};
auto bitmap_mask = BitCast(u8, LoadDup128(u64, bm));

auto bitmap_rows_76543210 = AndNot(row76543210_ne_0, bitmap_mask);
auto a0 = SumsOf8(bitmap_rows_76543210);
Expand Down
8 changes: 4 additions & 4 deletions lib/color.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ HWY_ATTR void rgb2ycbcr(uint8_t *HWY_RESTRICT in, std::vector<uint8_t *> &out, i
auto v1 = Undefined(u8);
auto v2 = Undefined(u8);

uint8_t *o0 = out[0];
uint8_t *o1 = out[1];
uint8_t *o2 = out[2];
constexpr size_t N = Lanes(u8);
uint8_t *HWY_RESTRICT o0 = out[0];
uint8_t *HWY_RESTRICT o1 = out[1];
uint8_t *HWY_RESTRICT o2 = out[2];
constexpr size_t N = Lanes(u8);
for (size_t i = width * LINES; i > 0; i -= N) {
LoadInterleaved3(u8, in, v0, v1, v2);

Expand Down
3 changes: 2 additions & 1 deletion lib/image_chunk.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "constants.hpp"
#include "ycctype.hpp"
#include "hwy/ops/set_macros-inl.h"

class imchunk {
private:
Expand All @@ -27,7 +28,7 @@ class imchunk {
: width(w),
height(h),
ncomp(nc),
rounded_width(round_up(width, DCTSIZE * (YCC_HV[YCCtype][0] >> 4))),
rounded_width(round_up(width, HWY_MAX(DCTSIZE * (YCC_HV[YCCtype][0] >> 4), HWY_MAX_BYTES))),
origin(p),
g_buf(imdata),
buf(hwy::AllocateAligned<uint8_t>(static_cast<size_t>(width) * ncomp * LINES)),
Expand Down
4 changes: 2 additions & 2 deletions lib/jpegenc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ class jpeg_encoder_impl {
ncomp(inimg.nc),
QF(qf),
YCCtype(ycc),
rounded_width(round_up(inimg.width, DCTSIZE * (YCC_HV[YCCtype][0] >> 4))),
rounded_height(round_up(inimg.height, DCTSIZE * (YCC_HV[YCCtype][0] & 0xF))),
rounded_width(round_up(width, HWY_MAX(DCTSIZE * (YCC_HV[YCCtype][0] >> 4), HWY_MAX_BYTES))),
rounded_height(round_up(height, DCTSIZE * (YCC_HV[YCCtype][0] & 0xF))),
line_buffer0(ncomp),
line_buffer1(ncomp),
yuv0(ncomp),
Expand Down
1 change: 1 addition & 0 deletions lib/jpgheaders.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ void create_DHT(int c, bitstream &enc) {
}
}
std::vector<uint8_t> tmp;
tmp.reserve(256);
// Li
for (int f : freq) {
tmp.push_back(f);
Expand Down