diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c20a730 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,17 @@ +# Set the default behavior, in case people don't have core.autocrlf set. +* text=auto + +# Explicitly declare text files you want to always be normalized and converted +# to native line endings on checkout. +*.c text +*.h text +*.cpp text +*.hpp text + +# Declare files that will always have CRLF line endings on checkout. +*.sln text eol=crlf + +# Denote all files that are truly binary and should not be modified. +*.pgx binary +*.j2k binary +*.ppm binary diff --git a/CHANGELOG b/CHANGELOG index df24fd7..0306b68 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,9 @@ +# [0.2.8] - 2024-11-12 + +* Fix incorrect packet parsing for RPCL, PCRL, CPRL +* Introduce stride access into DWT +* Change cmake configuration for MinGW environments + # [0.2.7] - 2024-06-13 * Refactor non-SIMD HT cleanup decoding diff --git a/CMakeLists.txt b/CMakeLists.txt index b3de7a1..b73d1c7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,9 +153,7 @@ endif() if(NOT EMSCRIPTEN) if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^[xX]86_64$|^[aA][mM][dD]64$") # x86_64 - if(NOT MINGW) - option(ENABLE_AVX2 "Enable the use of Intel AVX2 intrinsics" ON) - endif() + option(ENABLE_AVX2 "Enable the use of Intel AVX2 intrinsics" ON) if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /EHsc /D \"_CRT_SECURE_NO_WARNINGS\"") diff --git a/source/apps/imgcmp/image_class.hpp b/source/apps/imgcmp/image_class.hpp index c4e71ea..317a882 100644 --- a/source/apps/imgcmp/image_class.hpp +++ b/source/apps/imgcmp/image_class.hpp @@ -70,7 +70,7 @@ class image { bitDepth(0), isSigned(false), isBigendian(false), - data(nullptr) {}; + data(nullptr){}; // // destructor ~image() { delete[] data; } @@ -110,9 +110,9 @@ class image { // parsing PNM/PGX header int read_pnmpgx(const char *name) { - constexpr char SP = ' '; - constexpr char LF = '\n'; - constexpr char CR = 13; + constexpr char SP = ' '; + constexpr char LF = '\n'; + [[maybe_unused]] constexpr char CR = 13; FILE *fp = fopen(name, "rb"); if (fp == nullptr) { @@ -223,7 +223,7 @@ class image { } } // read numerical value - while (c != SP && c != LF && c != CR) { + while (c >= '0' && c <= '9') { val *= 10; val += c - '0'; c = fgetc(fp); @@ -255,8 +255,10 @@ class image { } } // easting trailing spaces/LF/CR or comments - c = fgetc(fp); + c = fgetc(fp); + int count = 0; while (c == SP || c == LF) { + count++; c = fgetc(fp); if (c == '#') { char *nouse = fgets(comment, sizeof(comment), fp); @@ -267,7 +269,7 @@ class image { c = fgetc(fp); } } - fseek(fp, -1, SEEK_CUR); + fseek(fp, -count - 1, SEEK_CUR); const uint_fast8_t nbytes = static_cast((bitDepth + 7) / 8); // ceil bitDepth to byte const size_t num_samples = this->width * this->height * num_components; diff --git a/source/core/coding/coding_units.cpp b/source/core/coding/coding_units.cpp index c08a108..555f495 100644 --- a/source/core/coding/coding_units.cpp +++ b/source/core/coding/coding_units.cpp @@ -267,8 +267,8 @@ void j2k_codeblock::create_compressed_buffer(buf_chain *tile_buf, int32_t buf_li *******************************************************************************/ j2k_precinct_subband::j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uint8_t R_b, uint8_t transformation, float stepsize, sprec_t *ibuf, - const element_siz &bp0, const element_siz &bp1, - const element_siz &p0, const element_siz &p1, + const element_siz &bp0, const element_siz &p0, + const element_siz &p1, const uint32_t band_stride, const uint16_t &num_layers, const element_siz &codeblock_size, const uint8_t &Cmodes) : j2k_region(p0, p1), @@ -290,7 +290,7 @@ j2k_precinct_subband::j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uin } const uint32_t num_codeblocks = this->num_codeblock_x * this->num_codeblock_y; - const uint32_t band_stride = bp1.x - bp0.x; + // const uint32_t band_stride = stride; if (num_codeblocks != 0) { inclusion_info = new tagtree(this->num_codeblock_x, this->num_codeblock_y); ZBP_info = new tagtree(this->num_codeblock_x, this->num_codeblock_y); @@ -551,9 +551,7 @@ void j2k_precinct_subband::parse_packet_header(buf_chain *packet_header, uint16_ if (!(block->Cmodes & HT_MIXED)) { // Must be the first HT Cleanup pass if (segment_bytes < 2) { - printf( - "ERROR: Length information for a HT-codeblock is " - "invalid\n"); + printf("ERROR: Length information for a HT-codeblock is invalid\n"); throw std::exception(); } next_segment_passes = 2; @@ -595,9 +593,7 @@ void j2k_precinct_subband::parse_packet_header(buf_chain *packet_header, uint16_ if (block->Cmodes & HT_MIXED) { block->Cmodes &= static_cast(~(HT_PHLD | HT)); } else { - printf( - "ERROR: Length information for a HT-codeblock is " - "invalid\n"); + printf("ERROR: Length information for a HT-codeblock is invalid\n"); throw std::exception(); } } @@ -1028,7 +1024,7 @@ j2k_precinct::j2k_precinct(const uint8_t &r, const uint32_t &idx, const element_ ceil_int(pos1.y - yob[subband[i]->orientation], sr)); this->pband[i] = MAKE_UNIQUE( subband[i]->orientation, subband[i]->M_b, subband[i]->R_b, subband[i]->transformation, - subband[i]->delta, subband[i]->i_samples, subband[i]->pos0, subband[i]->pos1, pbpos0, pbpos1, + subband[i]->delta, subband[i]->i_samples, subband[i]->pos0, pbpos0, pbpos1, subband[i]->stride, num_layers, codeblock_size, Cmodes); } } @@ -1059,8 +1055,9 @@ j2k_subband::j2k_subband(element_siz p0, element_siz p1, uint8_t orientation, ui if (num_samples) { if (orientation != BAND_LL) { // If not the lowest resolution, buffers for subbands shall be created. - i_samples = static_cast(aligned_mem_alloc(sizeof(sprec_t) * num_samples, 32)); - memset(i_samples, 0, sizeof(sprec_t) * num_samples); + i_samples = + static_cast(aligned_mem_alloc(sizeof(sprec_t) * this->stride * (pos1.y - pos0.y), 32)); + memset(i_samples, 0, sizeof(sprec_t) * this->stride * (pos1.y - pos0.y)); } else { i_samples = ibuf; } @@ -1096,10 +1093,12 @@ j2k_resolution::j2k_resolution(const uint8_t &r, const element_siz &p0, const el i_samples = nullptr; if (!is_empty) { if (index == 0) { - i_samples = static_cast(aligned_mem_alloc(sizeof(sprec_t) * num_samples, 32)); + i_samples = + static_cast(aligned_mem_alloc(sizeof(sprec_t) * this->stride * (pos1.y - pos0.y), 32)); memset(i_samples, 0, sizeof(sprec_t) * num_samples); } else { - i_samples = static_cast(aligned_mem_alloc(sizeof(sprec_t) * num_samples, 32)); + i_samples = + static_cast(aligned_mem_alloc(sizeof(sprec_t) * this->stride * (pos1.y - pos0.y), 32)); } } } @@ -2504,7 +2503,7 @@ void j2k_tile::decode() { // copy samples in resolution buffer to that in tile component buffer uint32_t height = tc1.y - tc0.y; - uint32_t width = tc1.x - tc0.x; + uint32_t width = round_up(tc1.x - tc0.x, 32U); uint32_t stride = round_up(width, 32U); // size_t num_samples = static_cast(tc1.x - tc0.x) * (tc1.y - tc0.y); #if defined(OPENHTJ2K_ENABLE_ARM_NEON) @@ -2664,7 +2663,7 @@ void j2k_tile::find_gcd_of_precinct_size(element_siz &out) { for (uint8_t r = 0; r <= this->tcomp[c].get_dwt_levels(); r++) { PP = this->tcomp[c].get_precinct_size(r); PPx = (PPx > PP.x) ? static_cast(PP.x) : PPx; - PPy = (PPy > PP.y) ? static_cast(PP.y) : PPx; + PPy = (PPy > PP.y) ? static_cast(PP.y) : PPy; } } out.x = PPx; @@ -3010,7 +3009,7 @@ uint8_t *j2k_tile::encode() { #if defined(OPENHTJ2K_TRY_AVX2) && defined(__AVX2__) for (uint32_t y = 0; y < height; ++y) { int32_t *sp = src + y * stride; - sprec_t *dp = cr->i_samples + y * (bottom_right.x - top_left.x); + sprec_t *dp = cr->i_samples + y * stride; uint32_t num_tc_samples = bottom_right.x - top_left.x; for (; num_tc_samples >= 16; num_tc_samples -= 16) { __m256i v0 = _mm256_load_si256((__m256i *)sp); @@ -3027,7 +3026,7 @@ uint8_t *j2k_tile::encode() { #elif defined(OPENHTJ2K_ENABLE_ARM_NEON) for (uint32_t y = 0; y < height; ++y) { int32_t *sp = src + y * stride; - sprec_t *dp = cr->i_samples + y * (bottom_right.x - top_left.x); + sprec_t *dp = cr->i_samples + y * stride; uint32_t num_tc_samples = bottom_right.x - top_left.x; for (; num_tc_samples >= 8; num_tc_samples -= 8) { auto vsrc0 = vld1q_s32(sp); @@ -3043,7 +3042,7 @@ uint8_t *j2k_tile::encode() { #else for (uint32_t y = 0; y < height; ++y) { int32_t *sp = src + y * stride; - sprec_t *dp = cr->i_samples + y * (bottom_right.x - top_left.x); + sprec_t *dp = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32U); uint32_t num_tc_samples = bottom_right.x - top_left.x; for (; num_tc_samples > 0; --num_tc_samples) { *dp++ = static_cast(*sp++); diff --git a/source/core/coding/coding_units.hpp b/source/core/coding/coding_units.hpp index 6aa5e5d..a9d5963 100644 --- a/source/core/coding/coding_units.hpp +++ b/source/core/coding/coding_units.hpp @@ -41,6 +41,9 @@ class j2k_region { element_siz pos0; // bottom-right coordinate (exclusive) of a region in the reference grid element_siz pos1; + // width for line buffer + uint32_t stride; + // return top-left coordinate (inclusive) [[nodiscard]] element_siz get_pos0() const { return pos0; } // return bottom-right coordinate (exclusive) @@ -55,7 +58,7 @@ class j2k_region { // set bottom-right coordinate (exclusive) void set_pos1(element_siz in) { pos1 = in; } j2k_region() = default; - j2k_region(element_siz p0, element_siz p1) : pos0(p0), pos1(p1) {} + j2k_region(element_siz p0, element_siz p1) : pos0(p0), pos1(p1), stride(round_up(pos1.x - pos0.x, 32U)) {} }; /******************************************************************************** @@ -176,8 +179,8 @@ class j2k_precinct_subband : public j2k_region { uint32_t num_codeblock_x; uint32_t num_codeblock_y; j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uint8_t R_b, uint8_t transformation, - float stepsize, sprec_t *ibuf, const element_siz &bp0, const element_siz &bp1, - const element_siz &p0, const element_siz &p1, const uint16_t &num_layers, + float stepsize, sprec_t *ibuf, const element_siz &bp0, const element_siz &p0, + const element_siz &p1, const uint32_t stride, const uint16_t &num_layers, const element_siz &codeblock_size, const uint8_t &Cmodes); ~j2k_precinct_subband() { delete inclusion_info; @@ -312,7 +315,7 @@ class j2k_resolution : public j2k_region { void scale(); void destroy() { aligned_mem_free(i_samples); - for (auto b = 0; b < num_bands; ++b) { + for (uint8_t b = 0; b < num_bands; ++b) { if (subbands != nullptr) { subbands[b]->destroy(); } @@ -420,7 +423,7 @@ class j2k_tile_component : public j2k_tile_base { void perform_dc_offset(uint8_t transformation, bool is_signed); void destroy() { - for (auto r = 0; r < this->NL; ++r) { + for (uint8_t r = 0; r < this->NL; ++r) { if (resolution != nullptr) { auto p = resolution[r].get(); if (p != nullptr) resolution[r]->destroy(); @@ -491,7 +494,7 @@ class j2k_tile : public j2k_tile_base { public: j2k_tile(); void destroy() { - for (auto c = 0; c < this->num_components; ++c) { + for (uint16_t c = 0; c < this->num_components; ++c) { tcomp[c].destroy(); } } diff --git a/source/core/coding/ht_block_decoding.cpp b/source/core/coding/ht_block_decoding.cpp index ec2373c..2fddb92 100644 --- a/source/core/coding/ht_block_decoding.cpp +++ b/source/core/coding/ht_block_decoding.cpp @@ -53,6 +53,422 @@ uint8_t j2k_codeblock::calc_mbr(const uint32_t i, const uint32_t j, const uint8_ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t Lcup, const int32_t Pcup, const int32_t Scup) { + uint8_t *compressed_data = block->get_compressed_data(); + const uint16_t QW = static_cast(ceil_int(static_cast(block->size.x), 2)); + const uint16_t QH = static_cast(ceil_int(static_cast(block->size.y), 2)); + + uint16_t scratch[8 * 513] = {0}; + int32_t sstr = static_cast(((block->size.x + 2) + 7u) & ~7u); // multiples of 8 + uint16_t *sp; + int32_t qx; + /*******************************************************************************************************************/ + // VLC, UVLC and MEL decoding + /*******************************************************************************************************************/ + MEL_dec MEL(compressed_data, Lcup, Scup); + rev_buf VLC_dec(compressed_data, Lcup, Scup); + auto sp0 = block->block_states + 1 + block->blkstate_stride; + auto sp1 = block->block_states + 1 + 2 * block->blkstate_stride; + uint32_t u_off0, u_off1; + uint32_t u0, u1; + uint32_t context = 0; + uint32_t vlcval; + + const uint16_t *dec_table; + // Initial line-pair + dec_table = dec_CxtVLC_table0_fast_16; + sp = scratch; + int32_t mel_run = MEL.get_run(); + for (qx = QW; qx > 0; qx -= 2, sp += 4) { + // Decoding of significance and EMB patterns and unsigned residual offsets + vlcval = VLC_dec.fetch(); + uint16_t tv0 = dec_table[(vlcval & 0x7F) + context]; + if (context == 0) { + mel_run -= 2; + tv0 = (mel_run == -1) ? tv0 : 0; + if (mel_run < 0) { + mel_run = MEL.get_run(); + } + } + sp[0] = tv0; + + // calculate context for the next quad, Eq. (1) in the spec + context = ((tv0 & 0xE0U) << 2) | ((tv0 & 0x10U) << 3); // = context << 7 + + // Decoding of significance and EMB patterns and unsigned residual offsets + vlcval = VLC_dec.advance((tv0 & 0x000F) >> 1); + uint16_t tv1 = dec_table[(vlcval & 0x7F) + context]; + if (context == 0 && qx > 1) { + mel_run -= 2; + tv1 = (mel_run == -1) ? tv1 : 0; + if (mel_run < 0) { + mel_run = MEL.get_run(); + } + } + tv1 = (qx > 1) ? tv1 : 0; + sp[2] = tv1; + + // store sigma + *sp0++ = ((tv0 >> 4) >> 0) & 1; + *sp0++ = ((tv0 >> 4) >> 2) & 1; + *sp0++ = ((tv1 >> 4) >> 0) & 1; + *sp0++ = ((tv1 >> 4) >> 2) & 1; + *sp1++ = ((tv0 >> 4) >> 1) & 1; + *sp1++ = ((tv0 >> 4) >> 3) & 1; + *sp1++ = ((tv1 >> 4) >> 1) & 1; + *sp1++ = ((tv1 >> 4) >> 3) & 1; + + // calculate context for the next quad, Eq. (1) in the spec + context = ((tv1 & 0xE0U) << 2) | ((tv1 & 0x10U) << 3); // = context << 7 + + vlcval = VLC_dec.advance((tv1 & 0x000F) >> 1); + u_off0 = tv0 & 1; + u_off1 = tv1 & 1; + + uint32_t mel_offset = 0; + if (u_off0 == 1 && u_off1 == 1) { + mel_run -= 2; + mel_offset = (mel_run == -1) ? 0x40 : 0; + if (mel_run < 0) { + mel_run = MEL.get_run(); + } + } + + // UVLC decoding + uint32_t idx = (vlcval & 0x3F) + (u_off0 << 6U) + (u_off1 << 7U) + mel_offset; + uint32_t uvlc_result = uvlc_dec_0[idx]; + // remove total prefix length + vlcval = VLC_dec.advance(uvlc_result & 0x7); + uvlc_result >>= 3; + // extract suffixes for quad 0 and 1 + uint32_t len = uvlc_result & 0xF; // suffix length for 2 quads (up to 10 = 5 + 5) + // ((1U << len) - 1U) can be replaced with _bzhi_u32(UINT32_MAX, len); not fast + uint32_t tmp = vlcval & ((1U << len) - 1U); // suffix value for 2 quads + vlcval = VLC_dec.advance(len); + uvlc_result >>= 4; + // quad 0 length + len = uvlc_result & 0x7; // quad 0 suffix length + uvlc_result >>= 3; + // U = 1+ u + u0 = 1 + (uvlc_result & 7) + (tmp & ~(0xFFU << len)); // always kappa = 1 in initial line pair + u1 = 1 + (uvlc_result >> 3) + (tmp >> len); // always kappa = 1 in initial line pair + + sp[1] = static_cast(u0); + sp[3] = static_cast(u1); + } + // sp[0] = sp[1] = 0; + + // Non-initial line-pair + dec_table = dec_CxtVLC_table1_fast_16; + for (uint16_t row = 1; row < QH; row++) { + sp0 = block->block_states + (row * 2U + 1U) * block->blkstate_stride + 1U; + sp1 = sp0 + block->blkstate_stride; + + sp = scratch + row * sstr; + // calculate context for the next quad: w, sw, nw are always 0 at the head of a row + context = ((sp[0 - sstr] & 0xA0U) << 2) | ((sp[2 - sstr] & 0x20U) << 4); + for (qx = QW; qx > 0; qx -= 2, sp += 4) { + // Decoding of significance and EMB patterns and unsigned residual offsets + vlcval = VLC_dec.fetch(); + uint16_t tv0 = dec_table[(vlcval & 0x7F) + context]; + if (context == 0) { + mel_run -= 2; + tv0 = (mel_run == -1) ? tv0 : 0; + if (mel_run < 0) { + mel_run = MEL.get_run(); + } + } + // calculate context for the next quad, Eq. (2) in the spec + context = ((tv0 & 0x40U) << 2) | ((tv0 & 0x80U) << 1); // (w | sw) << 8 + context |= (sp[0 - sstr] & 0x80U) | ((sp[2 - sstr] & 0xA0U) << 2); // ((nw | n) << 7) | (ne << 9) + context |= (sp[4 - sstr] & 0x20U) << 4; // ( nf) << 9 + + sp[0] = tv0; + + vlcval = VLC_dec.advance((tv0 & 0x000F) >> 1); + + // Decoding of significance and EMB patterns and unsigned residual offsets + uint16_t tv1 = dec_table[(vlcval & 0x7F) + context]; + if (context == 0 && qx > 1) { + mel_run -= 2; + tv1 = (mel_run == -1) ? tv1 : 0; + if (mel_run < 0) { + mel_run = MEL.get_run(); + } + } + tv1 = (qx > 1) ? tv1 : 0; + // calculate context for the next quad, Eq. (2) in the spec + context = ((tv1 & 0x40U) << 2) | ((tv1 & 0x80U) << 1); // (w | sw) << 8 + context |= (sp[2 - sstr] & 0x80U) | ((sp[4 - sstr] & 0xA0U) << 2); // ((nw | n) << 7) | (ne << 9) + context |= (sp[6 - sstr] & 0x20U) << 4; // ( nf) << 9 + + sp[2] = tv1; + + // store sigma + *sp0++ = ((tv0 >> 4) >> 0) & 1; + *sp0++ = ((tv0 >> 4) >> 2) & 1; + *sp0++ = ((tv1 >> 4) >> 0) & 1; + *sp0++ = ((tv1 >> 4) >> 2) & 1; + *sp1++ = ((tv0 >> 4) >> 1) & 1; + *sp1++ = ((tv0 >> 4) >> 3) & 1; + *sp1++ = ((tv1 >> 4) >> 1) & 1; + *sp1++ = ((tv1 >> 4) >> 3) & 1; + + vlcval = VLC_dec.advance((tv1 & 0x000F) >> 1); + + // UVLC decoding + u_off0 = tv0 & 1; + u_off1 = tv1 & 1; + uint32_t idx = (vlcval & 0x3F) + (u_off0 << 6U) + (u_off1 << 7U); + + uint32_t uvlc_result = uvlc_dec_1[idx]; + // remove total prefix length + vlcval = VLC_dec.advance(uvlc_result & 0x7); + uvlc_result >>= 3; + // extract suffixes for quad 0 and 1 + uint32_t len = uvlc_result & 0xF; // suffix length for 2 quads (up to 10 = 5 + 5) + // ((1U << len) - 1U) can be replaced with _bzhi_u32(UINT32_MAX, len); not fast + uint32_t tmp = vlcval & ((1U << len) - 1U); // suffix value for 2 quads + vlcval = VLC_dec.advance(len); + uvlc_result >>= 4; + // quad 0 length + len = uvlc_result & 0x7; // quad 0 suffix length + uvlc_result >>= 3; + u0 = (uvlc_result & 7) + (tmp & ~(0xFFU << len)); + u1 = (uvlc_result >> 3) + (tmp >> len); + + sp[1] = static_cast(u0); + sp[3] = static_cast(u1); + } + // sp[0] = sp[1] = 0; + } + + /*******************************************************************************************************************/ + // MagSgn decoding + /*******************************************************************************************************************/ + { + // We allocate a scratch row for storing v_n values. + // We have 512 quads horizontally. + // We need an extra entry to handle the case of vp[1] + // when vp is at the last column. + // Here, we allocate 4 instead of 1 to make the buffer size + // a multipled of 16 bytes. + const int v_n_size = 512 + 4; + uint32_t v_n_scratch[v_n_size] = {0}; // 2+ kB + + fwd_buf<0xFF> MagSgn(compressed_data, Pcup); + + uint16_t *sp = scratch; + uint32_t *vp = v_n_scratch; + int32_t *dp = block->sample_buf; + + uint32_t prev_v_n = 0; + for (uint32_t x = 0; x < block->size.x; sp += 2, ++vp) { + uint32_t inf = sp[0]; + uint32_t U_q = sp[1]; + if (U_q > ((30U - pLSB) + 2U)) { + printf("ERROR\n"); + } + + uint32_t v_n; + uint32_t val = 0; + uint32_t bit = 0; + if (inf & (1 << (4 + bit))) { + // get 32 bits of magsgn data + uint32_t ms_val = MagSgn.fetch(); + uint32_t m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + MagSgn.advance(m_n); // consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (pLSB - 1); + } + dp[0] = static_cast(val); + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) { + // get 32 bits of magsgn data + uint32_t ms_val = MagSgn.fetch(); + uint32_t m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + MagSgn.advance(m_n); // consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (pLSB - 1); + } + dp[block->blksampl_stride] = static_cast(val); + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= block->size.x) { + ++vp; + break; + } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) { + // get 32 bits of magsgn data + uint32_t ms_val = MagSgn.fetch(); + uint32_t m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + MagSgn.advance(m_n); // consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (pLSB - 1); + } + dp[0] = static_cast(val); + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) { + // get 32 bits of magsgn data + uint32_t ms_val = MagSgn.fetch(); + uint32_t m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + MagSgn.advance(m_n); // consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (pLSB - 1); + } + dp[block->blksampl_stride] = static_cast(val); + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + + for (uint32_t y = 2; y < block->size.y; y += 2) { + uint16_t *sp = scratch + (y >> 1) * static_cast(sstr); + uint32_t *vp = v_n_scratch; + int32_t *dp = block->sample_buf + y * block->blksampl_stride; + + prev_v_n = 0; + for (uint32_t x = 0; x < block->size.x; sp += 2, ++vp) { + uint32_t inf = sp[0]; + uint32_t u_q = sp[1]; + + uint32_t gamma = inf & 0xF0; + gamma &= gamma - 0x10; // is gamma_q 1? + uint32_t emax = vp[0] | vp[1]; + emax = 31 - count_leading_zeros(emax | 2); // emax - 1 + uint32_t kappa = gamma ? emax : 1; + + uint32_t U_q = u_q + kappa; + if (U_q > ((30U - pLSB) + 2U)) { + printf("ERROR\n"); + } + + uint32_t v_n; + uint32_t val = 0; + uint32_t bit = 0; + if (inf & (1 << (4 + bit))) { + // get 32 bits of magsgn data + uint32_t ms_val = MagSgn.fetch(); + uint32_t m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + MagSgn.advance(m_n); // consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (pLSB - 1); + } + dp[0] = static_cast(val); + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) { + // get 32 bits of magsgn data + uint32_t ms_val = MagSgn.fetch(); + uint32_t m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + MagSgn.advance(m_n); // consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (pLSB - 1); + } + dp[block->blksampl_stride] = static_cast(val); + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= block->size.x) { + ++vp; + break; + } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) { + // get 32 bits of magsgn data + uint32_t ms_val = MagSgn.fetch(); + uint32_t m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + MagSgn.advance(m_n); // consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (pLSB - 1); + } + dp[0] = static_cast(val); + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) { + // get 32 bits of magsgn data + uint32_t ms_val = MagSgn.fetch(); + uint32_t m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + MagSgn.advance(m_n); // consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (pLSB - 1); + } + dp[block->blksampl_stride] = static_cast(val); + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + } + } +} + +void ht_cleanup_decode2(j2k_codeblock *block, const uint8_t &pLSB, const int32_t Lcup, const int32_t Pcup, + const int32_t Scup) { fwd_buf<0xFF> MagSgn(block->get_compressed_data(), Pcup); MEL_dec MEL(block->get_compressed_data(), Lcup, Scup); rev_buf VLC_dec(block->get_compressed_data(), Lcup, Scup); diff --git a/source/core/coding/ht_block_decoding.hpp b/source/core/coding/ht_block_decoding.hpp index 4752d81..cd2c185 100644 --- a/source/core/coding/ht_block_decoding.hpp +++ b/source/core/coding/ht_block_decoding.hpp @@ -866,7 +866,7 @@ class fwd_buf { */ FORCE_INLINE void advance(uint32_t num_bits) { // if (!num_bits) return; - if (!(num_bits >= 0 && num_bits <= this->bits && num_bits < 128)) { + if (!(num_bits <= this->bits && num_bits < 128)) { printf("Value of numbits = %d is out of range.\n", num_bits); throw std::exception(); } diff --git a/source/core/coding/ht_block_encoding_avx2.hpp b/source/core/coding/ht_block_encoding_avx2.hpp index 8f4a44d..6c971fb 100644 --- a/source/core/coding/ht_block_encoding_avx2.hpp +++ b/source/core/coding/ht_block_encoding_avx2.hpp @@ -85,22 +85,22 @@ class state_MS_enc { uint32_t t = 0; // _bzhi_u32(UINT32_MAX, len) = ((1U << len) - 1U) - tmp = val & ((1 << (8 - stuff)) - 1); + tmp = val & ((1U << (8U - stuff)) - 1U); t |= tmp; bits_local += 8 - stuff; stuff = (tmp == 0xFF); - tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1); + tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U); t |= tmp << 8; bits_local += 8 - stuff; stuff = (tmp == 0xFF); - tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1); + tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U); t |= tmp << 16; bits_local += 8 - stuff; stuff = (tmp == 0xFF); - tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1); + tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U); t |= tmp << 24; bits_local += 8 - stuff; last = tmp & 0xFF; @@ -168,7 +168,7 @@ class state_MS_enc { } #else for (int i = 0; i < 4; ++i) { - Creg |= static_cast<__uint128_t>(_mm_cvtsi128_si32(v)) << ctreg; + Creg |= static_cast(static_cast<__uint128_t>(_mm_cvtsi128_si32(v)) << ctreg); ctreg += static_cast(_mm_cvtsi128_si32(m)); v = _mm_srli_si128(v, 4); m = _mm_srli_si128(m, 4); diff --git a/source/core/common/open_htj2k_version.hpp b/source/core/common/open_htj2k_version.hpp index 028a808..ead0ea1 100644 --- a/source/core/common/open_htj2k_version.hpp +++ b/source/core/common/open_htj2k_version.hpp @@ -28,4 +28,4 @@ #define OPENHTJ2K_VERSION_MAJOR 0 #define OPENHTJ2K_VERSION_MINOR 2 -#define OPENHTJ2K_VERSION_PATCH 7 +#define OPENHTJ2K_VERSION_PATCH 8 diff --git a/source/core/transform/color.cpp b/source/core/transform/color.cpp index 48df1ab..f63aba9 100644 --- a/source/core/transform/color.cpp +++ b/source/core/transform/color.cpp @@ -26,7 +26,7 @@ // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#if !defined(OPENHTJ2K_TRY_AVX2) && !defined(__AVX2__) && !defined(OPENHTJ2K_ENABLE_ARM_NEON) +#if not defined(OPENHTJ2K_TRY_AVX2) && not defined(OPENHTJ2K_ENABLE_ARM_NEON) #include "color.hpp" void cvt_rgb_to_ycbcr_rev(int32_t *sp0, int32_t *sp1, int32_t *sp2, uint32_t width, uint32_t height) { diff --git a/source/core/transform/dwt.hpp b/source/core/transform/dwt.hpp index 5d50078..e34c546 100644 --- a/source/core/transform/dwt.hpp +++ b/source/core/transform/dwt.hpp @@ -60,11 +60,11 @@ constexpr int32_t Dshift = 16; // define pointer to FDWT functions typedef void (*fdwt_1d_filtr_func_fixed)(sprec_t *, const int32_t, const int32_t, const int32_t); typedef void (*fdwt_ver_filtr_func_fixed)(sprec_t *, const int32_t, const int32_t, const int32_t, - const int32_t); + const int32_t, const int32_t stride); // define pointer to IDWT functions typedef void (*idwt_1d_filtd_func_fixed)(sprec_t *, const int32_t, const int32_t, const int32_t); typedef void (*idwt_ver_filtd_func_fixed)(sprec_t *, const int32_t, const int32_t, const int32_t, - const int32_t); + const int32_t, const int32_t stride); // symmetric extension static inline int32_t PSEo(const int32_t i, const int32_t i0, const int32_t i1) { @@ -90,22 +90,24 @@ static inline void dwt_1d_extr_fixed(T *extbuf, T *buf, const int32_t left, cons #if defined(OPENHTJ2K_ENABLE_ARM_NEON) void fdwt_1d_filtr_irrev97_fixed_neon(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1); void fdwt_1d_filtr_rev53_fixed_neon(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1); -void fdwt_irrev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1); -void fdwt_rev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1); +void fdwt_irrev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, + int32_t stride); +void fdwt_rev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, + int32_t stride); #elif defined(OPENHTJ2K_ENABLE_AVX2) void fdwt_1d_filtr_irrev97_fixed_avx2(sprec_t *X, const int32_t left, const int32_t u_i0, const int32_t u_i1); void fdwt_1d_filtr_rev53_fixed_avx2(sprec_t *X, const int32_t left, const int32_t u_i0, const int32_t u_i1); void fdwt_irrev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1); + const int32_t v1, const int32_t stride); void fdwt_rev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1); + const int32_t v1, const int32_t stride); #else void fdwt_1d_filtr_irrev97_fixed(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1); void fdwt_1d_filtr_rev53_fixed(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1); -void fdwt_irrev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1); -void fdwt_rev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1); +void fdwt_irrev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, int32_t stride); +void fdwt_rev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, int32_t stride); #endif void fdwt_2d_sr_fixed(sprec_t *previousLL, sprec_t *LL, sprec_t *HL, sprec_t *LH, sprec_t *HH, int32_t u0, @@ -115,18 +117,22 @@ void fdwt_2d_sr_fixed(sprec_t *previousLL, sprec_t *LL, sprec_t *HL, sprec_t *LH #if defined(OPENHTJ2K_ENABLE_ARM_NEON) void idwt_1d_filtr_rev53_fixed_neon(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1); void idwt_1d_filtr_irrev97_fixed_neon(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1); -void idwt_irrev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1); -void idwt_rev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1); +void idwt_irrev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, + int32_t stride); +void idwt_rev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, + int32_t stride); #elif defined(OPENHTJ2K_ENABLE_AVX2) void idwt_1d_filtr_rev53_fixed_avx2(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1); void idwt_1d_filtr_irrev97_fixed_avx2(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1); -void idwt_irrev_ver_sr_fixed_avx2(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1); -void idwt_rev_ver_sr_fixed_avx2(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1); +void idwt_irrev_ver_sr_fixed_avx2(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, + int32_t stride); +void idwt_rev_ver_sr_fixed_avx2(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, + int32_t stride); #else void idwt_1d_filtr_rev53_fixed(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1); void idwt_1d_filtr_irrev97_fixed(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1); -void idwt_irrev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1); -void idwt_rev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1); +void idwt_irrev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, int32_t stride); +void idwt_rev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, int32_t stride); #endif void idwt_2d_sr_fixed(sprec_t *nextLL, sprec_t *LL, sprec_t *HL, sprec_t *LH, sprec_t *HH, int32_t u0, int32_t u1, int32_t v0, int32_t v1, uint8_t transformation, diff --git a/source/core/transform/fdwt.cpp b/source/core/transform/fdwt.cpp index e1ef89d..0459573 100644 --- a/source/core/transform/fdwt.cpp +++ b/source/core/transform/fdwt.cpp @@ -107,8 +107,7 @@ static inline void fdwt_1d_sr_fixed(sprec_t *buf, sprec_t *in, const int32_t lef // FDWT for horizontal direction static void fdwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1, const uint8_t transformation) { - const int32_t stride = u1 - u0; + const int32_t v1, const uint8_t transformation, const int32_t stride) { constexpr int32_t num_pse_i0[2][2] = {{4, 2}, {3, 1}}; constexpr int32_t num_pse_i1[2][2] = {{3, 1}, {4, 2}}; const int32_t left = num_pse_i0[u0 % 2][transformation]; @@ -118,16 +117,17 @@ static void fdwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, c // one sample case for (int32_t row = 0; row < v1 - v0; ++row) { if (u0 % 2 == 0) { - in[row] = (transformation) ? in[row] : in[row]; + in[row * stride] = (transformation) ? in[row * stride] : in[row * stride]; } else { - in[row] = (transformation) ? static_cast(in[row] << 1) : in[row]; + in[row * stride] = + (transformation) ? static_cast(in[row * stride] << 1) : in[row * stride]; } } } else { // need to perform symmetric extension const int32_t len = u1 - u0 + left + right; auto *Xext = static_cast(aligned_mem_alloc( - sizeof(sprec_t) * static_cast(round_up(len + SIMD_PADDING, SIMD_PADDING)), 32)); + sizeof(sprec_t) * static_cast(round_up(len + SIMD_PADDING, SIMD_PADDING)), 32)); // #pragma omp parallel for for (int32_t row = 0; row < v1 - v0; ++row) { fdwt_1d_sr_fixed(Xext, in, left, right, u0, u1, transformation); @@ -138,8 +138,7 @@ static void fdwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, c } void fdwt_irrev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {4, 3}; constexpr int32_t num_pse_i1[2] = {3, 4}; const int32_t top = num_pse_i0[v0 % 2]; @@ -215,8 +214,7 @@ void fdwt_irrev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, co } void fdwt_rev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {2, 1}; constexpr int32_t num_pse_i1[2] = {1, 2}; const int32_t top = num_pse_i0[v0 % 2]; @@ -280,8 +278,8 @@ void fdwt_rev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, cons // Deinterleaving to devide coefficients into subbands static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t *const HL, sprec_t *const LH, sprec_t *const HH, const int32_t u0, - const int32_t u1, const int32_t v0, const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t u1, const int32_t v0, const int32_t v1, + const int32_t stride) { const int32_t v_offset = v0 % 2; const int32_t u_offset = u0 % 2; sprec_t *dp[4] = {LL, HL, LH, HH}; @@ -291,41 +289,43 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t const int32_t ustop[4] = {ceil_int(u1, 2), u1 / 2, ceil_int(u1, 2), u1 / 2}; const int32_t voffset[4] = {v_offset, v_offset, 1 - v_offset, 1 - v_offset}; const int32_t uoffset[4] = {u_offset, 1 - u_offset, u_offset, 1 - u_offset}; + const int32_t stride2[4] = {round_up(ustop[0] - ustart[0], 32), round_up(ustop[1] - ustart[1], 32), + round_up(ustop[2] - ustart[2], 32), round_up(ustop[3] - ustart[3], 32)}; #if defined(OPENHTJ2K_ENABLE_ARM_NEON) if ((ustop[0] - ustart[0]) != (ustop[1] - ustart[1])) { for (uint8_t b = 0; b < 2; ++b) { for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { + sprec_t *line = dp[b] + v * stride2[b]; for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { - *(dp[b]++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride]; + *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride]; } } } } else { sprec_t *first, *second; - first = LL; - second = HL; + first = dp[0]; + second = dp[1]; if (uoffset[0] > uoffset[1]) { - first = HL; - second = LL; + first = dp[1]; + second = dp[0]; } for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) { - sprec_t *sp = buf + (2 * v + voffset[0]) * stride; - size_t len = static_cast(ustop[0] - ustart[0]); + sprec_t *sp = buf + (2 * v + voffset[0]) * stride; + size_t len = static_cast(ustop[0] - ustart[0]); + sprec_t *line0 = first + v * stride2[0]; + sprec_t *line1 = second + v * stride2[0]; for (; len >= 8; len -= 8) { - openhtj2k_arm_prefetch2(sp, 0); - openhtj2k_arm_prefetch2(first, 1); - openhtj2k_arm_prefetch2(second, 1); auto vline = vld2q_s16(sp); - vst1q_s16(first, vline.val[0]); - vst1q_s16(second, vline.val[1]); - first += 8; - second += 8; + vst1q_s16(line0, vline.val[0]); + vst1q_s16(line1, vline.val[1]); + line0 += 8; + line1 += 8; sp += 16; } for (; len > 0; --len) { - *first++ = *sp++; - *second++ = *sp++; + *line0++ = *sp++; + *line1++ = *sp++; } } } @@ -333,63 +333,66 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t if ((ustop[2] - ustart[2]) != (ustop[3] - ustart[3])) { for (uint8_t b = 2; b < 4; ++b) { for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { + sprec_t *line = dp[b] + v * stride2[b]; for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { - *(dp[b]++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride]; + *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride]; } } } } else { sprec_t *first, *second; - first = LH; - second = HH; + first = dp[2]; + second = dp[3]; if (uoffset[2] > uoffset[3]) { - first = HH; - second = LH; + first = dp[3]; + second = dp[2]; } for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) { - sprec_t *sp = buf + (2 * v + voffset[2]) * stride; - size_t len = static_cast(ustop[2] - ustart[2]); + sprec_t *sp = buf + (2 * v + voffset[2]) * stride; + size_t len = static_cast(ustop[2] - ustart[2]); + sprec_t *line0 = first + v * stride2[2]; + sprec_t *line1 = second + v * stride2[2]; for (; len >= 8; len -= 8) { - openhtj2k_arm_prefetch2(sp, 0); - openhtj2k_arm_prefetch2(first, 1); - openhtj2k_arm_prefetch2(second, 1); auto vline = vld2q_s16(sp); - vst1q_s16(first, vline.val[0]); - vst1q_s16(second, vline.val[1]); - first += 8; - second += 8; + vst1q_s16(line0, vline.val[0]); + vst1q_s16(line1, vline.val[1]); + line0 += 8; + line1 += 8; sp += 16; } for (; len > 0; --len) { - *first++ = *sp++; - *second++ = *sp++; + *line0++ = *sp++; + *line1++ = *sp++; } } } #elif defined(OPENHTJ2K_TRY_AVX2) && defined(__AVX2__) if ((ustop[0] - ustart[0]) != (ustop[1] - ustart[1])) { - for (uint8_t b = 0; b < 2; ++b) { - for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { - for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { - *(dp[b]++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride]; + for (uint8_t b = 0; b < 2; ++b) { + for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { + sprec_t *line = dp[b] + v * stride2[b]; + for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { + *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride]; } } } } else { - sprec_t *first, *second; - first = LL; - second = HL; - if (uoffset[0] > uoffset[1]) { - first = HL; - second = LL; - } - const __m256i vshmask = _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14, + sprec_t *first, *second; + first = dp[0]; + second = dp[1]; + if (uoffset[0] > uoffset[1]) { + first = dp[1]; + second = dp[0]; + } + const __m256i vshmask = _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0); - for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) { - sprec_t *sp = buf + (2 * v + voffset[0]) * stride; - size_t len = static_cast(ustop[0] - ustart[0]); - for (; len >= 8; len -= 8) { - // SSE version + for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) { + sprec_t *sp = buf + (2 * v + voffset[0]) * stride; + size_t len = static_cast(ustop[0] - ustart[0]); + sprec_t *line0 = first + v * stride2[0]; + sprec_t *line1 = second + v * stride2[0]; + for (; len >= 8; len -= 8) { + // SSE version // auto vline0 = _mm_loadu_si128((__m128i *)sp); // auto vline1 = _mm_loadu_si128((__m128i *)(sp + 8)); // vline0 = _mm_shufflelo_epi16(vline0, _MM_SHUFFLE(3, 1, 2, 0)); @@ -398,47 +401,50 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t // vline1 = _mm_shufflehi_epi16(vline1, _MM_SHUFFLE(3, 1, 2, 0)); // vline0 = _mm_shuffle_epi32(vline0, _MM_SHUFFLE(3, 1, 2, 0)); // A1 A2 A3 A4 B1 B2 B3 B4 // vline1 = _mm_shuffle_epi32(vline1, _MM_SHUFFLE(3, 1, 2, 0)); // A5 A6 A7 A8 B5 B6 B7 B8 - // _mm_storeu_si128((__m128i *)first, _mm_unpacklo_epi64(vline0, vline1)); - // _mm_storeu_si128((__m128i *)second, _mm_unpackhi_epi64(vline0, vline1)); + // _mm_storeu_si128((__m128i *)line0, _mm_unpacklo_epi64(vline0, vline1)); + // _mm_storeu_si128((__m128i *)line1, _mm_unpackhi_epi64(vline0, vline1)); __m256i vline = _mm256_loadu_si256((__m256i *)sp); vline = _mm256_shuffle_epi8(vline, vshmask); vline = _mm256_permute4x64_epi64(vline, 0xD8); - _mm256_storeu2_m128i((__m128i *)second, (__m128i *)first, vline); - first += 8; - second += 8; + _mm256_storeu2_m128i((__m128i *)line1, (__m128i *)line0, vline); + line0 += 8; + line1 += 8; sp += 16; } for (; len > 0; --len) { - *first++ = *sp++; - *second++ = *sp++; + *line0++ = *sp++; + *line1++ = *sp++; } } } if ((ustop[2] - ustart[2]) != (ustop[3] - ustart[3])) { - for (uint8_t b = 2; b < 4; ++b) { - for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { - for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { - *(dp[b]++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride]; + for (uint8_t b = 2; b < 4; ++b) { + for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { + sprec_t *line = dp[b] + v * stride2[b]; + for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { + *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride]; } } } } else { - sprec_t *first, *second; - first = LH; - second = HH; - if (uoffset[2] > uoffset[3]) { - first = HH; - second = LH; - } - const __m256i vshmask = _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14, + sprec_t *first, *second; + first = dp[2]; + second = dp[3]; + if (uoffset[2] > uoffset[3]) { + first = dp[3]; + second = dp[2]; + } + const __m256i vshmask = _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0); - for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) { - sprec_t *sp = buf + (2 * v + voffset[2]) * stride; - size_t len = static_cast(ustop[2] - ustart[2]); - for (; len >= 8; len -= 8) { - // SSE version + for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) { + sprec_t *sp = buf + (2 * v + voffset[2]) * stride; + size_t len = static_cast(ustop[2] - ustart[2]); + sprec_t *line0 = first + v * stride2[2]; + sprec_t *line1 = second + v * stride2[2]; + for (; len >= 8; len -= 8) { + // SSE version // auto vline0 = _mm_loadu_si128((__m128i *)sp); // auto vline1 = _mm_loadu_si128((__m128i *)(sp + 8)); // vline0 = _mm_shufflelo_epi16(vline0, _MM_SHUFFLE(3, 1, 2, 0)); @@ -447,28 +453,29 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t // vline1 = _mm_shufflehi_epi16(vline1, _MM_SHUFFLE(3, 1, 2, 0)); // vline0 = _mm_shuffle_epi32(vline0, _MM_SHUFFLE(3, 1, 2, 0)); // A1 A2 A3 A4 B1 B2 B3 B4 // vline1 = _mm_shuffle_epi32(vline1, _MM_SHUFFLE(3, 1, 2, 0)); // A5 A6 A7 A8 B5 B6 B7 B8 - // _mm_storeu_si128((__m128i *)first, _mm_unpacklo_epi64(vline0, vline1)); - // _mm_storeu_si128((__m128i *)second, _mm_unpackhi_epi64(vline0, vline1)); + // _mm_storeu_si128((__m128i *)line0, _mm_unpacklo_epi64(vline0, vline1)); + // _mm_storeu_si128((__m128i *)line1, _mm_unpackhi_epi64(vline0, vline1)); __m256i vline = _mm256_loadu_si256((__m256i *)sp); vline = _mm256_shuffle_epi8(vline, vshmask); vline = _mm256_permute4x64_epi64(vline, 0xD8); - _mm256_storeu2_m128i((__m128i *)second, (__m128i *)first, vline); - first += 8; - second += 8; + _mm256_storeu2_m128i((__m128i *)line1, (__m128i *)line0, vline); + line0 += 8; + line1 += 8; sp += 16; } for (; len > 0; --len) { - *first++ = *sp++; - *second++ = *sp++; + *line0++ = *sp++; + *line1++ = *sp++; } } } #else for (uint8_t b = 0; b < 4; ++b) { - for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { - for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { - *(dp[b]++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride]; + for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { + sprec_t *line = dp[b] + v * stride2[b]; + for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { + *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride]; } } } @@ -479,8 +486,9 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t void fdwt_2d_sr_fixed(sprec_t *previousLL, sprec_t *LL, sprec_t *HL, sprec_t *LH, sprec_t *HH, const int32_t u0, const int32_t u1, const int32_t v0, const int32_t v1, const uint8_t transformation) { - sprec_t *src = previousLL; - fdwt_ver_sr_fixed[transformation](src, u0, u1, v0, v1); - fdwt_hor_sr_fixed(src, u0, u1, v0, v1, transformation); - fdwt_2d_deinterleave_fixed(src, LL, HL, LH, HH, u0, u1, v0, v1); + const int32_t stride = round_up(u1 - u0, 32); + sprec_t *src = previousLL; + fdwt_ver_sr_fixed[transformation](src, u0, u1, v0, v1, stride); + fdwt_hor_sr_fixed(src, u0, u1, v0, v1, transformation, stride); + fdwt_2d_deinterleave_fixed(src, LL, HL, LH, HH, u0, u1, v0, v1, stride); } \ No newline at end of file diff --git a/source/core/transform/fdwt_avx2.cpp b/source/core/transform/fdwt_avx2.cpp index b1e449a..cc12f04 100644 --- a/source/core/transform/fdwt_avx2.cpp +++ b/source/core/transform/fdwt_avx2.cpp @@ -338,8 +338,7 @@ auto fdwt_irrev97_fixed_avx2_ver_step3 = [](const int32_t simdlen, int16_t *cons }; void fdwt_irrev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {4, 3}; constexpr int32_t num_pse_i1[2] = {3, 4}; const int32_t top = num_pse_i0[v0 % 2]; @@ -421,8 +420,7 @@ void fdwt_irrev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u // reversible FDWT void fdwt_rev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {2, 1}; constexpr int32_t num_pse_i1[2] = {1, 2}; const int32_t top = num_pse_i0[v0 % 2]; diff --git a/source/core/transform/fdwt_neon.cpp b/source/core/transform/fdwt_neon.cpp index 8091b6b..033c759 100644 --- a/source/core/transform/fdwt_neon.cpp +++ b/source/core/transform/fdwt_neon.cpp @@ -261,8 +261,7 @@ auto fdwt_irrev97_fixed_neon_ver_step3 = [](const int32_t simdlen, int16_t *cons }; void fdwt_irrev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {4, 3}; constexpr int32_t num_pse_i1[2] = {3, 4}; const int32_t top = num_pse_i0[v0 % 2]; @@ -344,8 +343,7 @@ void fdwt_irrev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u // reversible FDWT void fdwt_rev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {2, 1}; constexpr int32_t num_pse_i1[2] = {1, 2}; const int32_t top = num_pse_i0[v0 % 2]; diff --git a/source/core/transform/idwt.cpp b/source/core/transform/idwt.cpp index 5eba173..a2161d9 100644 --- a/source/core/transform/idwt.cpp +++ b/source/core/transform/idwt.cpp @@ -102,8 +102,7 @@ static void idwt_1d_sr_fixed(sprec_t *buf, sprec_t *in, const int32_t left, cons } static void idwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1, const uint8_t transformation) { - const int32_t stride = u1 - u0; + const int32_t v1, const uint8_t transformation, const int32_t stride) { constexpr int32_t num_pse_i0[2][2] = {{3, 1}, {4, 2}}; constexpr int32_t num_pse_i1[2][2] = {{4, 2}, {3, 1}}; const int32_t left = num_pse_i0[u0 % 2][transformation]; @@ -114,14 +113,14 @@ static void idwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, c for (int32_t row = 0; row < v1 - v0; ++row) { // in[row] = in[row]; if (u0 % 2 != 0 && transformation) { - in[row] = static_cast(in[row] >> 1); + in[row * stride] = static_cast(in[row * stride] >> 1); } } } else { // need to perform symmetric extension const int32_t len = u1 - u0 + left + right; auto *Yext = static_cast(aligned_mem_alloc( - sizeof(sprec_t) * static_cast(round_up(len + SIMD_PADDING, SIMD_PADDING)), 32)); + sizeof(sprec_t) * static_cast(round_up(len + SIMD_PADDING, SIMD_PADDING)), 32)); for (int32_t row = 0; row < v1 - v0; ++row) { idwt_1d_sr_fixed(Yext, in, left, right, u0, u1, transformation); in += stride; @@ -131,8 +130,7 @@ static void idwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, c } void idwt_irrev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {3, 4}; constexpr int32_t num_pse_i1[2] = {4, 3}; const int32_t top = num_pse_i0[v0 % 2]; @@ -204,8 +202,7 @@ void idwt_irrev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, co } void idwt_rev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {1, 2}; constexpr int32_t num_pse_i1[2] = {2, 1}; const int32_t top = num_pse_i0[v0 % 2]; @@ -263,8 +260,7 @@ void idwt_rev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, cons } static void idwt_2d_interleave_fixed(sprec_t *buf, sprec_t *LL, sprec_t *HL, sprec_t *LH, sprec_t *HH, - int32_t u0, int32_t u1, int32_t v0, int32_t v1) { - const int32_t stride = u1 - u0; + int32_t u0, int32_t u1, int32_t v0, int32_t v1, const int32_t stride) { const int32_t v_offset = v0 % 2; const int32_t u_offset = u0 % 2; sprec_t *sp[4] = {LL, HL, LH, HH}; @@ -274,49 +270,50 @@ static void idwt_2d_interleave_fixed(sprec_t *buf, sprec_t *LL, sprec_t *HL, spr const int32_t ustop[4] = {ceil_int(u1, 2), u1 / 2, ceil_int(u1, 2), u1 / 2}; const int32_t voffset[4] = {v_offset, v_offset, 1 - v_offset, 1 - v_offset}; const int32_t uoffset[4] = {u_offset, 1 - u_offset, u_offset, 1 - u_offset}; + const int32_t stride2[4] = {round_up(ustop[0] - ustart[0], 32), round_up(ustop[1] - ustart[1], 32), + round_up(ustop[2] - ustart[2], 32), round_up(ustop[3] - ustart[3], 32)}; #if defined(OPENHTJ2K_ENABLE_ARM_NEON) if ((ustop[0] - ustart[0]) != (ustop[1] - ustart[1])) { for (uint8_t b = 0; b < 2; ++b) { for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { + sprec_t *line = sp[b] + v * stride2[b]; for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { - buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(sp[b]++); + buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++); } } } } else { sprec_t *first, *second; - first = LL; - second = HL; + first = sp[0]; + second = sp[1]; if (uoffset[0] > uoffset[1]) { - first = HL; - second = LL; + first = sp[1]; + second = sp[0]; } - openhtj2k_arm_prefetch(first); - openhtj2k_arm_prefetch(second); int16x8_t vfirst0, vfirst1, vsecond0, vsecond1; // int16x8x2_t vdst0, vdst1; for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) { - sprec_t *dp = buf + (2 * v + voffset[0]) * stride; - size_t len = static_cast(ustop[0] - ustart[0]); + sprec_t *dp = buf + (2 * v + voffset[0]) * stride; + size_t len = static_cast(ustop[0] - ustart[0]); + sprec_t *line0 = first + v * stride2[0]; + sprec_t *line1 = second + v * stride2[0]; for (; len >= 16; len -= 16) { - vfirst0 = vld1q_s16(first); - vsecond0 = vld1q_s16(second); + vfirst0 = vld1q_s16(line0); + vsecond0 = vld1q_s16(line1); vst1q_s16(dp, vzip1q_s16(vfirst0, vsecond0)); vst1q_s16(dp + 8, vzip2q_s16(vfirst0, vsecond0)); - vfirst1 = vld1q_s16(first + 8); - vsecond1 = vld1q_s16(second + 8); + vfirst1 = vld1q_s16(line0 + 8); + vsecond1 = vld1q_s16(line1 + 8); vst1q_s16(dp + 16, vzip1q_s16(vfirst1, vsecond1)); vst1q_s16(dp + 24, vzip2q_s16(vfirst1, vsecond1)); - first += 16; - second += 16; - openhtj2k_arm_prefetch(first); - openhtj2k_arm_prefetch(second); + line0 += 16; + line1 += 16; dp += 32; } for (; len > 0; --len) { - *dp++ = *first++; - *dp++ = *second++; + *dp++ = *line0++; + *dp++ = *line1++; } } } @@ -324,158 +321,164 @@ static void idwt_2d_interleave_fixed(sprec_t *buf, sprec_t *LL, sprec_t *HL, spr if ((ustop[2] - ustart[2]) != (ustop[3] - ustart[3])) { for (uint8_t b = 2; b < 4; ++b) { for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { + sprec_t *line = sp[b] + v * stride2[b]; for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { - buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(sp[b]++); + buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++); } } } } else { sprec_t *first, *second; - first = LH; - second = HH; + first = sp[2]; + second = sp[3]; if (uoffset[2] > uoffset[3]) { - first = HH; - second = LH; + first = sp[3]; + second = sp[2]; } - openhtj2k_arm_prefetch(first); - openhtj2k_arm_prefetch(second); int16x8_t vfirst0, vfirst1, vsecond0, vsecond1; // int16x8x2_t vdst0, vdst1; for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) { - sprec_t *dp = buf + (2 * v + voffset[2]) * stride; - size_t len = static_cast(ustop[2] - ustart[2]); + sprec_t *dp = buf + (2 * v + voffset[2]) * stride; + size_t len = static_cast(ustop[2] - ustart[2]); + sprec_t *line0 = first + v * stride2[2]; + sprec_t *line1 = second + v * stride2[2]; for (; len >= 16; len -= 16) { - vfirst0 = vld1q_s16(first); - vsecond0 = vld1q_s16(second); + vfirst0 = vld1q_s16(line0); + vsecond0 = vld1q_s16(line1); vst1q_s16(dp, vzip1q_s16(vfirst0, vsecond0)); vst1q_s16(dp + 8, vzip2q_s16(vfirst0, vsecond0)); - vfirst1 = vld1q_s16(first + 8); - vsecond1 = vld1q_s16(second + 8); + vfirst1 = vld1q_s16(line0 + 8); + vsecond1 = vld1q_s16(line1 + 8); vst1q_s16(dp + 16, vzip1q_s16(vfirst1, vsecond1)); vst1q_s16(dp + 24, vzip2q_s16(vfirst1, vsecond1)); - first += 16; - second += 16; - openhtj2k_arm_prefetch(first); - openhtj2k_arm_prefetch(second); + line0 += 16; + line1 += 16; dp += 32; } for (; len > 0; --len) { - *dp++ = *first++; - *dp++ = *second++; + *dp++ = *line0++; + *dp++ = *line1++; } } } #elif defined(OPENHTJ2K_TRY_AVX2) && defined(__AVX2__) if ((ustop[0] - ustart[0]) != (ustop[1] - ustart[1])) { - for (uint8_t b = 0; b < 2; ++b) { - for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { - for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { - buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(sp[b]++); + for (uint8_t b = 0; b < 2; ++b) { + for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { + sprec_t *line = sp[b] + v * stride2[b]; + for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { + buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++); } } } } else { - sprec_t *first, *second; - first = LL; - second = HL; - if (uoffset[0] > uoffset[1]) { - first = HL; - second = LL; - } - for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) { - sprec_t *dp = buf + (2 * v + voffset[0]) * stride; - size_t len = static_cast(ustop[0] - ustart[0]); - // SSE version - // for (; len >= 8; len -= 8) { - // auto vfirst = _mm_loadu_si128((__m128i *)first); - // auto vsecond = _mm_loadu_si128((__m128i *)second); - // auto vtmp0 = _mm_unpacklo_epi16(vfirst, vsecond); - // auto vtmp1 = _mm_unpackhi_epi16(vfirst, vsecond); - // _mm_storeu_si128((__m128i *)dp, vtmp0); - // _mm_storeu_si128((__m128i *)(dp + 8), vtmp1); - // first += 8; - // second += 8; - // dp += 16; - // } + sprec_t *first, *second; + first = sp[0]; + second = sp[1]; + if (uoffset[0] > uoffset[1]) { + first = sp[1]; + second = sp[0]; + } + for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) { + sprec_t *dp = buf + (2 * v + voffset[0]) * stride; + size_t len = static_cast(ustop[0] - ustart[0]); + sprec_t *line0 = first + v * stride2[0]; + sprec_t *line1 = second + v * stride2[0]; + // SSE version + // for (; len >= 8; len -= 8) { + // auto vfirst = _mm_loadu_si128((__m128i *)line0); + // auto vsecond = _mm_loadu_si128((__m128i *)line1); + // auto vtmp0 = _mm_unpacklo_epi16(vfirst, vsecond); + // auto vtmp1 = _mm_unpackhi_epi16(vfirst, vsecond); + // _mm_storeu_si128((__m128i *)dp, vtmp0); + // _mm_storeu_si128((__m128i *)(dp + 8), vtmp1); + // line0 += 8; + // line1 += 8; + // dp += 16; + // } - // AVX2 version - __m256i vfirst, vsecond; - for (; len >= 16; len -= 16) { - vfirst = _mm256_loadu_si256((__m256i *)first); - vsecond = _mm256_loadu_si256((__m256i *)second); - auto vtmp0 = _mm256_unpacklo_epi16(vfirst, vsecond); - auto vtmp1 = _mm256_unpackhi_epi16(vfirst, vsecond); + // AVX2 version + __m256i vfirst, vsecond; + for (; len >= 16; len -= 16) { + vfirst = _mm256_loadu_si256((__m256i *)line0); + vsecond = _mm256_loadu_si256((__m256i *)line1); + auto vtmp0 = _mm256_unpacklo_epi16(vfirst, vsecond); + auto vtmp1 = _mm256_unpackhi_epi16(vfirst, vsecond); - _mm256_storeu_si256((__m256i *)dp, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x20)); - _mm256_storeu_si256((__m256i *)dp + 1, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x31)); - first += 16; - second += 16; - dp += 32; + _mm256_storeu_si256((__m256i *)dp, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x20)); + _mm256_storeu_si256((__m256i *)dp + 1, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x31)); + line0 += 16; + line1 += 16; + dp += 32; } - for (; len > 0; --len) { - *dp++ = *first++; - *dp++ = *second++; + for (; len > 0; --len) { + *dp++ = *line0++; + *dp++ = *line1++; } } } if ((ustop[2] - ustart[2]) != (ustop[3] - ustart[3])) { - for (uint8_t b = 2; b < 4; ++b) { - for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { - for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { - buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(sp[b]++); + for (uint8_t b = 2; b < 4; ++b) { + for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { + sprec_t *line = sp[b] + v * stride2[b]; + for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { + buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++); } } } } else { - sprec_t *first, *second; - first = LH; - second = HH; - if (uoffset[2] > uoffset[3]) { - first = HH; - second = LH; - } - for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) { - sprec_t *dp = buf + (2 * v + voffset[2]) * stride; - size_t len = static_cast(ustop[2] - ustart[2]); - // SSE version - // for (; len >= 8; len -= 8) { - // auto vfirst = _mm_loadu_si128((__m128i *)first); - // auto vsecond = _mm_loadu_si128((__m128i *)second); - // auto vtmp0 = _mm_unpacklo_epi16(vfirst, vsecond); - // auto vtmp1 = _mm_unpackhi_epi16(vfirst, vsecond); - // _mm_storeu_si128((__m128i *)dp, vtmp0); - // _mm_storeu_si128((__m128i *)(dp + 8), vtmp1); - // first += 8; - // second += 8; - // dp += 16; - // } + sprec_t *first, *second; + first = sp[2]; + second = sp[3]; + if (uoffset[2] > uoffset[3]) { + first = sp[3]; + second = sp[2]; + } + for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) { + sprec_t *dp = buf + (2 * v + voffset[2]) * stride; + size_t len = static_cast(ustop[2] - ustart[2]); + sprec_t *line0 = first + v * stride2[2]; + sprec_t *line1 = second + v * stride2[2]; + // SSE version + // for (; len >= 8; len -= 8) { + // auto vfirst = _mm_loadu_si128((__m128i *)line0); + // auto vsecond = _mm_loadu_si128((__m128i *)line1); + // auto vtmp0 = _mm_unpacklo_epi16(vfirst, vsecond); + // auto vtmp1 = _mm_unpackhi_epi16(vfirst, vsecond); + // _mm_storeu_si128((__m128i *)dp, vtmp0); + // _mm_storeu_si128((__m128i *)(dp + 8), vtmp1); + // line0 += 8; + // line1 += 8; + // dp += 16; + // } - // AVX2 version - __m256i vfirst, vsecond; - for (; len >= 16; len -= 16) { - vfirst = _mm256_loadu_si256((__m256i *)first); - vsecond = _mm256_loadu_si256((__m256i *)second); - auto vtmp0 = _mm256_unpacklo_epi16(vfirst, vsecond); - auto vtmp1 = _mm256_unpackhi_epi16(vfirst, vsecond); + // AVX2 version + __m256i vfirst, vsecond; + for (; len >= 16; len -= 16) { + vfirst = _mm256_loadu_si256((__m256i *)line0); + vsecond = _mm256_loadu_si256((__m256i *)line1); + auto vtmp0 = _mm256_unpacklo_epi16(vfirst, vsecond); + auto vtmp1 = _mm256_unpackhi_epi16(vfirst, vsecond); - _mm256_storeu_si256((__m256i *)dp, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x20)); - _mm256_storeu_si256((__m256i *)dp + 1, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x31)); - first += 16; - second += 16; - dp += 32; + _mm256_storeu_si256((__m256i *)dp, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x20)); + _mm256_storeu_si256((__m256i *)dp + 1, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x31)); + line0 += 16; + line1 += 16; + dp += 32; } - for (; len > 0; --len) { - *dp++ = *first++; - *dp++ = *second++; + for (; len > 0; --len) { + *dp++ = *line0++; + *dp++ = *line1++; } } } #else for (uint8_t b = 0; b < 4; ++b) { - for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { - for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { - buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(sp[b]++); + for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) { + sprec_t *line = sp[b] + v * stride2[b]; + for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) { + buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++); } } } @@ -485,11 +488,12 @@ static void idwt_2d_interleave_fixed(sprec_t *buf, sprec_t *LL, sprec_t *HL, spr void idwt_2d_sr_fixed(sprec_t *nextLL, sprec_t *LL, sprec_t *HL, sprec_t *LH, sprec_t *HH, const int32_t u0, const int32_t u1, const int32_t v0, const int32_t v1, const uint8_t transformation, uint8_t normalizing_upshift) { - const int32_t buf_length = (u1 - u0) * (v1 - v0); + const int32_t stride = round_up(u1 - u0, 32); + const int32_t buf_length = stride * (v1 - v0); sprec_t *src = nextLL; - idwt_2d_interleave_fixed(src, LL, HL, LH, HH, u0, u1, v0, v1); - idwt_hor_sr_fixed(src, u0, u1, v0, v1, transformation); - idwt_ver_sr_fixed[transformation](src, u0, u1, v0, v1); + idwt_2d_interleave_fixed(src, LL, HL, LH, HH, u0, u1, v0, v1, stride); + idwt_hor_sr_fixed(src, u0, u1, v0, v1, transformation, stride); + idwt_ver_sr_fixed[transformation](src, u0, u1, v0, v1, stride); // scaling for 16bit width fixed-point representation if (transformation != 1 && normalizing_upshift) { @@ -513,19 +517,19 @@ void idwt_2d_sr_fixed(sprec_t *nextLL, sprec_t *LL, sprec_t *HL, sprec_t *LH, sp } #elif defined(OPENHTJ2K_TRY_AVX2) && defined(__AVX2__) for (; len >= 16; len -= 16) { - __m256i tmp0 = _mm256_load_si256((__m256i *)src); - __m256i tmp1 = _mm256_slli_epi16(tmp0, static_cast(normalizing_upshift)); - _mm256_store_si256((__m256i *)src, tmp1); - src += 16; + __m256i tmp0 = _mm256_load_si256((__m256i *)src); + __m256i tmp1 = _mm256_slli_epi16(tmp0, static_cast(normalizing_upshift)); + _mm256_store_si256((__m256i *)src, tmp1); + src += 16; } for (; len > 0; --len) { - // cast to unsigned to avoid undefined behavior + // cast to unsigned to avoid undefined behavior *src = static_cast(static_cast(*src) << normalizing_upshift); src++; } #else for (; len > 0; --len) { - // cast to unsigned to avoid undefined behavior + // cast to unsigned to avoid undefined behavior *src = static_cast(static_cast(*src) << normalizing_upshift); src++; } diff --git a/source/core/transform/idwt_avx2.cpp b/source/core/transform/idwt_avx2.cpp index c4a46cd..3ea6a91 100644 --- a/source/core/transform/idwt_avx2.cpp +++ b/source/core/transform/idwt_avx2.cpp @@ -258,8 +258,7 @@ auto idwt_irrev97_fixed_avx2_ver_step = [](const int32_t simdlen, int16_t *const }; void idwt_irrev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {3, 4}; constexpr int32_t num_pse_i1[2] = {4, 3}; const int32_t top = num_pse_i0[v0 % 2]; @@ -337,8 +336,7 @@ void idwt_irrev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u // reversible IDWT void idwt_rev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {1, 2}; constexpr int32_t num_pse_i1[2] = {2, 1}; const int32_t top = num_pse_i0[v0 % 2]; diff --git a/source/core/transform/idwt_neon.cpp b/source/core/transform/idwt_neon.cpp index d1891c3..7ef7305 100644 --- a/source/core/transform/idwt_neon.cpp +++ b/source/core/transform/idwt_neon.cpp @@ -257,8 +257,7 @@ auto idwt_irrev97_fixed_neon_ver_step3 = [](const int32_t simdlen, int16_t *cons }; void idwt_irrev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {3, 4}; constexpr int32_t num_pse_i1[2] = {4, 3}; const int32_t top = num_pse_i0[v0 % 2]; @@ -336,8 +335,7 @@ void idwt_irrev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u // reversible IDWT void idwt_rev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0, - const int32_t v1) { - const int32_t stride = u1 - u0; + const int32_t v1, const int32_t stride) { constexpr int32_t num_pse_i0[2] = {1, 2}; constexpr int32_t num_pse_i1[2] = {2, 1}; const int32_t top = num_pse_i0[v0 % 2];