diff --git a/apps/main_enc.cpp b/apps/main_enc.cpp index 83aa84e..ddc6f48 100644 --- a/apps/main_enc.cpp +++ b/apps/main_enc.cpp @@ -13,7 +13,7 @@ int main(int argc, char *argv[]) { return EXIT_FAILURE; } FILE *fp; - int fpos = read_pnm(fp, infile, width, height, nc); + size_t fpos = read_pnm(fp, infile, width, height, nc); jpegenc::im_info inimg(fp, fpos, width, height, nc); size_t duration = 0; @@ -31,7 +31,7 @@ int main(int argc, char *argv[]) { constexpr double warmuptime = 2000.0; // duration of warmup in milliseconds constexpr double benchtime = 1000.0; // duration of benchmark in milliseconds int iter = 0; - while (1) { + while (true) { encoder.invoke(); iter++; auto stop = std::chrono::high_resolution_clock::now() - start; diff --git a/include/jpegenc.hpp b/include/jpegenc.hpp index ae79a9f..470a589 100644 --- a/include/jpegenc.hpp +++ b/include/jpegenc.hpp @@ -17,11 +17,11 @@ namespace jpegenc { struct im_info { FILE *data; - const int32_t pos; + const size_t pos; const int32_t width; const int32_t height; const int32_t nc; - im_info(FILE *buf, int32_t fpos, int32_t w, int32_t h, int32_t c) + im_info(FILE *buf, size_t fpos, int32_t w, int32_t h, int32_t c) : data(buf), pos(fpos), width(w), height(h), nc(c) {} }; diff --git a/lib/bitstream.cpp b/lib/bitstream.cpp index 45a3080..218c163 100644 --- a/lib/bitstream.cpp +++ b/lib/bitstream.cpp @@ -28,7 +28,7 @@ HWY_ATTR void trial(uint8_t *HWY_RESTRICT in, uint8_t *HWY_RESTRICT out) { #if HWY_ONCE namespace jpegenc_hwy { HWY_EXPORT(trial); -void send_8_bytes(uint8_t *in, uint8_t *out) { +[[maybe_unused]] void send_8_bytes(uint8_t *in, uint8_t *out) { HWY_DYNAMIC_DISPATCH(trial) (in, out); } diff --git a/lib/bitstream.hpp b/lib/bitstream.hpp index f527dae..943caec 100644 --- a/lib/bitstream.hpp +++ b/lib/bitstream.hpp @@ -13,7 +13,7 @@ #define USE_VECTOR 0 namespace jpegenc_hwy { -void send_8_bytes(uint8_t *in, uint8_t *out); +[[maybe_unused]] void send_8_bytes(uint8_t *in, uint8_t *out); } // namespace jpegenc_hwy class stream_buf { @@ -54,15 +54,15 @@ class stream_buf { if (pos + 8 > len) { expand(); } - // // #if HWY_TARGET == HWY_NEON - // #if (HWY_TARGET | HWY_NEON_WITHOUT_AES) == HWY_NEON_WITHOUT_AES - // *(uint64_t *)cur_byte = __builtin_bswap64(val); - // #elif (HWY_TARGET | HWY_NEON) == HWY_NEON - // *(uint64_t *)cur_byte = __builtin_bswap64(val); - // #elif HWY_TARGET <= HWY_SSE2 - // *(uint64_t *)cur_byte = __bswap_64(val); - // #endif - jpegenc_hwy::send_8_bytes((uint8_t *)&val, cur_byte); + // emits eight uint8_t values at once +#if (HWY_TARGET | HWY_NEON_WITHOUT_AES) == HWY_NEON_WITHOUT_AES + *(uint64_t *)cur_byte = __builtin_bswap64(val); +#elif (HWY_TARGET | HWY_NEON) == HWY_NEON + *(uint64_t *)cur_byte = __builtin_bswap64(val); +#elif HWY_TARGET <= HWY_SSE2 + *(uint64_t *)cur_byte = __bswap_64(val); +#endif + // jpegenc_hwy::send_8_bytes((uint8_t *)&val, cur_byte); cur_byte += 8; pos += 8; } @@ -216,7 +216,8 @@ class bitstream { flush(); put_word(RST[n]); } - auto get_stream() { + + [[maybe_unused]] auto get_stream() { flush(); return &stream; } diff --git a/lib/block_coding.cpp b/lib/block_coding.cpp index acbaa44..4e0daac 100644 --- a/lib/block_coding.cpp +++ b/lib/block_coding.cpp @@ -10,7 +10,6 @@ #include "block_coding.hpp" #include "constants.hpp" #include "dct.hpp" -#include "huffman_tables.hpp" #include "quantization.hpp" #include "ycctype.hpp" diff --git a/lib/block_coding_256.cpp b/lib/block_coding_256.cpp index ccf5826..b7de151 100644 --- a/lib/block_coding_256.cpp +++ b/lib/block_coding_256.cpp @@ -30,20 +30,31 @@ auto row67 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[3 auto row23_1 = TwoTablesLookupLanes(s16, v2, v3, SetTableIndices(s16, &indices[4 * 16])); auto row45_1 = TwoTablesLookupLanes(s16, v0, v1, SetTableIndices(s16, &indices[5 * 16])); -HWY_ALIGN int16_t m[32] = { - -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -}; -auto maskv1 = Load(s16, m); -auto maskv2 = Load(s16, m + 16); -row23 = IfThenElseZero(MaskFromVec(maskv1), row23); -row45 = IfThenElseZero(MaskFromVec(maskv2), row45); -row23_1 = IfThenZeroElse(MaskFromVec(maskv1), row23_1); -row45_1 = IfThenZeroElse(MaskFromVec(maskv2), row45_1); -row23 = Or(row23, row23_1); -row45 = Or(row45, row45_1); -row01 = InsertLane(row01, 10, ExtractLane(v2, 0)); -row67 = InsertLane(row67, 5, ExtractLane(v1, 15)); +// HWY_ALIGN int16_t m[32] = { +// -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, +// -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, +// }; +// auto maskv1 = Load(s16, m); +// auto maskv2 = Load(s16, m + 16); +// +// row23 = IfThenElseZero(MaskFromVec(maskv1), row23); +// row45 = IfThenElseZero(MaskFromVec(maskv2), row45); +// row23_1 = IfThenZeroElse(MaskFromVec(maskv1), row23_1); +// row45_1 = IfThenZeroElse(MaskFromVec(maskv2), row45_1); + +HWY_ALIGN uint8_t m1[8] = {0x07, 0xFF}; +HWY_ALIGN uint8_t m2[8] = {0xFF, 0xE0}; +auto maskv1 = LoadMaskBits(s16, m1); +auto maskv2 = LoadMaskBits(s16, m2); + +row23 = IfThenElseZero(maskv1, row23); +row45 = IfThenElseZero(maskv2, row45); +row23_1 = IfThenZeroElse(maskv1, row23_1); +row45_1 = IfThenZeroElse(maskv2, row45_1); +row23 = Or(row23, row23_1); +row45 = Or(row45, row45_1); +row01 = InsertLane(row01, 10, ExtractLane(v2, 0)); +row67 = InsertLane(row67, 5, ExtractLane(v1, 15)); /* DCT block is now in zig-zag order; start Huffman encoding process. */ diff --git a/lib/color.cpp b/lib/color.cpp index 7f5c4b5..53d25b5 100644 --- a/lib/color.cpp +++ b/lib/color.cpp @@ -5,8 +5,6 @@ #include -#include - #include "color.hpp" #include "ycctype.hpp" #include "constants.hpp" diff --git a/lib/jpegenc.cpp b/lib/jpegenc.cpp index 14b7a48..b3d34ae 100644 --- a/lib/jpegenc.cpp +++ b/lib/jpegenc.cpp @@ -5,7 +5,6 @@ #include "block_coding.hpp" #include "color.hpp" #include "constants.hpp" -#include "dct.hpp" #include "image_chunk.hpp" #include "huffman_tables.hpp" #include "jpgheaders.hpp" diff --git a/lib/jpgheaders.cpp b/lib/jpgheaders.cpp index 931e5d3..6a3603a 100644 --- a/lib/jpgheaders.cpp +++ b/lib/jpgheaders.cpp @@ -1,4 +1,3 @@ -#include #include #include "bitstream.hpp" diff --git a/lib/quantization.cpp b/lib/quantization.cpp index 900600f..55eecfa 100644 --- a/lib/quantization.cpp +++ b/lib/quantization.cpp @@ -5,9 +5,6 @@ #include #include -#include -#include "ycctype.hpp" -#include "constants.hpp" #include "quantization.hpp" namespace jpegenc_hwy { diff --git a/lib/quantization.hpp b/lib/quantization.hpp index b6dc07a..47fef7a 100644 --- a/lib/quantization.hpp +++ b/lib/quantization.hpp @@ -39,6 +39,6 @@ constexpr float qmatrix[2][64] = { void create_scaled_qtable(int c, int QF, int16_t *qtable); namespace jpegenc_hwy { namespace HWY_NAMESPACE { -HWY_ATTR void quantize_core(int16_t *HWY_RESTRICT data, const int *HWY_RESTRICT qtable); +HWY_ATTR void quantize_core(int16_t *HWY_RESTRICT data, const int16_t *HWY_RESTRICT qtable); } // namespace HWY_NAMESPACE } // namespace jpegenc_hwy \ No newline at end of file