Add stride access for DWT with AVX2

osamu620 · Nov 15, 2024 · 7e77349
1 parent c8abe53
commit 7e77349
Show file tree

Hide file tree

Showing 12 changed files with 228 additions and 219 deletions.
diff --git a/source/apps/imgcmp/image_class.hpp b/source/apps/imgcmp/image_class.hpp
@@ -70,7 +70,7 @@ class image {
         bitDepth(0),
         isSigned(false),
         isBigendian(false),
-        data(nullptr) {};
+        data(nullptr){};
   // // destructor
   ~image() { delete[] data; }
 
@@ -110,9 +110,9 @@ class image {
 
   // parsing PNM/PGX header
   int read_pnmpgx(const char *name) {
-    constexpr char SP = ' ';
-    constexpr char LF = '\n';
-    constexpr char CR = 13;
+    constexpr char SP                  = ' ';
+    constexpr char LF                  = '\n';
+    [[maybe_unused]] constexpr char CR = 13;
 
     FILE *fp = fopen(name, "rb");
     if (fp == nullptr) {

diff --git a/source/core/coding/coding_units.cpp b/source/core/coding/coding_units.cpp
@@ -267,10 +267,10 @@ void j2k_codeblock::create_compressed_buffer(buf_chain *tile_buf, int32_t buf_li
  *******************************************************************************/
 j2k_precinct_subband::j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uint8_t R_b,
                                            uint8_t transformation, float stepsize, sprec_t *ibuf,
-                                           const element_siz &bp0, const element_siz &bp1,
-                                           const element_siz &p0, const element_siz &p1,
-                                           const uint32_t band_stride, const uint16_t &num_layers,
-                                           const element_siz &codeblock_size, const uint8_t &Cmodes)
+                                           const element_siz &bp0, const element_siz &p0,
+                                           const element_siz &p1, const uint32_t band_stride,
+                                           const uint16_t &num_layers, const element_siz &codeblock_size,
+                                           const uint8_t &Cmodes)
     : j2k_region(p0, p1),
       orientation(orientation),
       inclusion_info(nullptr),
@@ -1028,8 +1028,8 @@ j2k_precinct::j2k_precinct(const uint8_t &r, const uint32_t &idx, const element_
                              ceil_int(pos1.y - yob[subband[i]->orientation], sr));
     this->pband[i] = MAKE_UNIQUE<j2k_precinct_subband>(
         subband[i]->orientation, subband[i]->M_b, subband[i]->R_b, subband[i]->transformation,
-        subband[i]->delta, subband[i]->i_samples, subband[i]->pos0, subband[i]->pos1, pbpos0, pbpos1,
-        subband[i]->stride, num_layers, codeblock_size, Cmodes);
+        subband[i]->delta, subband[i]->i_samples, subband[i]->pos0, pbpos0, pbpos1, subband[i]->stride,
+        num_layers, codeblock_size, Cmodes);
   }
 }
 
@@ -1402,7 +1402,7 @@ void j2k_tile_component::init(j2k_main_header *hdr, j2k_tilepart_header *tphdr,
   const uint32_t aligned_stride =
       round_up((ceil_int(pos1.x, 1U << tile->reduce_NL) - ceil_int(pos0.x, 1U << tile->reduce_NL)), 32U);
   const auto height             = static_cast<uint32_t>(ceil_int(pos1.y, 1U << tile->reduce_NL)
-                                                        - ceil_int(pos0.y, 1U << tile->reduce_NL));
+                                            - ceil_int(pos0.y, 1U << tile->reduce_NL));
   const uint32_t num_bufsamples = aligned_stride * height;
   samples = static_cast<int32_t *>(aligned_mem_alloc(sizeof(int32_t) * num_bufsamples, 32));
 
@@ -2459,7 +2459,7 @@ void j2k_tile::decode() {
 #endif
             }
           }  // end of codeblock loop
-        }  // end of subbnad loop
+        }    // end of subbnad loop
 #ifdef OPENHTJ2K_THREAD
         for (auto &result : results) {
           result.get();
@@ -2507,7 +2507,7 @@ void j2k_tile::decode() {
 
     // copy samples in resolution buffer to that in tile component buffer
     uint32_t height = tc1.y - tc0.y;
-    uint32_t width  = round_up(tc1.x - tc0.x, 32);
+    uint32_t width  = round_up(tc1.x - tc0.x, 32U);
     uint32_t stride = round_up(width, 32U);
     // size_t num_samples = static_cast<size_t>(tc1.x - tc0.x) * (tc1.y - tc0.y);
 #if defined(OPENHTJ2K_ENABLE_ARM_NEON)
@@ -3013,7 +3013,7 @@ uint8_t *j2k_tile::encode() {
 #if defined(OPENHTJ2K_TRY_AVX2) && defined(__AVX2__)
     for (uint32_t y = 0; y < height; ++y) {
       int32_t *sp             = src + y * stride;
-      sprec_t *dp             = cr->i_samples + y * (bottom_right.x - top_left.x);
+      sprec_t *dp             = cr->i_samples + y * stride;
       uint32_t num_tc_samples = bottom_right.x - top_left.x;
       for (; num_tc_samples >= 16; num_tc_samples -= 16) {
         __m256i v0 = _mm256_load_si256((__m256i *)sp);
@@ -3030,7 +3030,7 @@ uint8_t *j2k_tile::encode() {
 #elif defined(OPENHTJ2K_ENABLE_ARM_NEON)
     for (uint32_t y = 0; y < height; ++y) {
       int32_t *sp             = src + y * stride;
-      sprec_t *dp             = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32);
+      sprec_t *dp             = cr->i_samples + y * stride;
       uint32_t num_tc_samples = bottom_right.x - top_left.x;
       for (; num_tc_samples >= 8; num_tc_samples -= 8) {
         auto vsrc0 = vld1q_s32(sp);
@@ -3046,7 +3046,7 @@ uint8_t *j2k_tile::encode() {
 #else
       for (uint32_t y = 0; y < height; ++y) {
         int32_t *sp             = src + y * stride;
-        sprec_t *dp             = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32);
+        sprec_t *dp             = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32U);
         uint32_t num_tc_samples = bottom_right.x - top_left.x;
         for (; num_tc_samples > 0; --num_tc_samples) {
           *dp++ = static_cast<sprec_t>(*sp++);

diff --git a/source/core/coding/coding_units.hpp b/source/core/coding/coding_units.hpp
@@ -58,7 +58,7 @@ class j2k_region {
   // set bottom-right coordinate (exclusive)
   void set_pos1(element_siz in) { pos1 = in; }
   j2k_region() = default;
-  j2k_region(element_siz p0, element_siz p1) : pos0(p0), pos1(p1), stride(round_up(pos1.x - pos0.x, 32)) {}
+  j2k_region(element_siz p0, element_siz p1) : pos0(p0), pos1(p1), stride(round_up(pos1.x - pos0.x, 32U)) {}
 };
 
 /********************************************************************************
@@ -179,10 +179,9 @@ class j2k_precinct_subband : public j2k_region {
   uint32_t num_codeblock_x;
   uint32_t num_codeblock_y;
   j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uint8_t R_b, uint8_t transformation,
-                       float stepsize, sprec_t *ibuf, const element_siz &bp0, const element_siz &bp1,
-                       const element_siz &p0, const element_siz &p1, const uint32_t stride,
-                       const uint16_t &num_layers, const element_siz &codeblock_size,
-                       const uint8_t &Cmodes);
+                       float stepsize, sprec_t *ibuf, const element_siz &bp0, const element_siz &p0,
+                       const element_siz &p1, const uint32_t stride, const uint16_t &num_layers,
+                       const element_siz &codeblock_size, const uint8_t &Cmodes);
   ~j2k_precinct_subband() {
     delete inclusion_info;
     delete ZBP_info;
@@ -257,7 +256,7 @@ class j2c_packet {
   uint32_t length;
 
   j2c_packet()
-      : layer(0), resolution(0), component(0), precinct(0), header(nullptr), body(nullptr), length(0) {};
+      : layer(0), resolution(0), component(0), precinct(0), header(nullptr), body(nullptr), length(0){};
   // constructor for decoding
   j2c_packet(const uint16_t l, const uint8_t r, const uint16_t c, const uint32_t p,
              buf_chain *const h = nullptr, buf_chain *const bo = nullptr)
@@ -316,7 +315,7 @@ class j2k_resolution : public j2k_region {
   void scale();
   void destroy() {
     aligned_mem_free(i_samples);
-    for (auto b = 0; b < num_bands; ++b) {
+    for (uint8_t b = 0; b < num_bands; ++b) {
       if (subbands != nullptr) {
         subbands[b]->destroy();
       }
@@ -424,7 +423,7 @@ class j2k_tile_component : public j2k_tile_base {
   void perform_dc_offset(uint8_t transformation, bool is_signed);
 
   void destroy() {
-    for (auto r = 0; r < this->NL; ++r) {
+    for (uint8_t r = 0; r < this->NL; ++r) {
       if (resolution != nullptr) {
         auto p = resolution[r].get();
         if (p != nullptr) resolution[r]->destroy();
@@ -495,7 +494,7 @@ class j2k_tile : public j2k_tile_base {
  public:
   j2k_tile();
   void destroy() {
-    for (auto c = 0; c < this->num_components; ++c) {
+    for (uint16_t c = 0; c < this->num_components; ++c) {
       tcomp[c].destroy();
     }
   }

diff --git a/source/core/coding/ht_block_decoding.cpp b/source/core/coding/ht_block_decoding.cpp
@@ -265,7 +265,7 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
     for (uint32_t x = 0; x < block->size.x; sp += 2, ++vp) {
       uint32_t inf = sp[0];
       uint32_t U_q = sp[1];
-      if (U_q > ((30 - pLSB) + 2)) {
+      if (U_q > ((30U - pLSB) + 2U)) {
         printf("ERROR\n");
       }
 
@@ -279,14 +279,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
         MagSgn.advance(m_n);                                // consume m_n
 
         val = ms_val << 31;                      // get sign bit
-        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
         v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
         v_n |= 1;                                // add center of bin
         // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
         // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
         val |= (v_n + 2) << (pLSB - 1);
       }
-      dp[0] = val;
+      dp[0] = static_cast<int32_t>(val);
 
       v_n = 0;
       val = 0;
@@ -298,14 +298,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
         MagSgn.advance(m_n);                                // consume m_n
 
         val = ms_val << 31;                      // get sign bit
-        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
         v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
         v_n |= 1;                                // add center of bin
         // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
         // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
         val |= (v_n + 2) << (pLSB - 1);
       }
-      dp[block->blksampl_stride] = val;
+      dp[block->blksampl_stride] = static_cast<int32_t>(val);
       vp[0]                      = prev_v_n | v_n;
       prev_v_n                   = 0;
       ++dp;
@@ -323,14 +323,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
         MagSgn.advance(m_n);                                // consume m_n
 
         val = ms_val << 31;                      // get sign bit
-        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
         v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
         v_n |= 1;                                // add center of bin
         // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
         // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
         val |= (v_n + 2) << (pLSB - 1);
       }
-      dp[0] = val;
+      dp[0] = static_cast<int32_t>(val);
 
       v_n = 0;
       val = 0;
@@ -342,22 +342,22 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
         MagSgn.advance(m_n);                                // consume m_n
 
         val = ms_val << 31;                      // get sign bit
-        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
         v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
         v_n |= 1;                                // add center of bin
         // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
         // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
         val |= (v_n + 2) << (pLSB - 1);
       }
-      dp[block->blksampl_stride] = val;
+      dp[block->blksampl_stride] = static_cast<int32_t>(val);
       prev_v_n                   = v_n;
       ++dp;
       ++x;
     }
     vp[0] = prev_v_n;
 
     for (uint32_t y = 2; y < block->size.y; y += 2) {
-      uint16_t *sp = scratch + (y >> 1) * sstr;
+      uint16_t *sp = scratch + (y >> 1) * static_cast<uint32_t>(sstr);
       uint32_t *vp = v_n_scratch;
       int32_t *dp  = block->sample_buf + y * block->blksampl_stride;
 
@@ -373,7 +373,7 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
         uint32_t kappa = gamma ? emax : 1;
 
         uint32_t U_q = u_q + kappa;
-        if (U_q > ((30 - pLSB) + 2)) {
+        if (U_q > ((30U - pLSB) + 2U)) {
           printf("ERROR\n");
         }
 
@@ -387,14 +387,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
           MagSgn.advance(m_n);                                // consume m_n
 
           val = ms_val << 31;                      // get sign bit
-          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
           v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
           v_n |= 1;                                // add center of bin
           // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
           // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
           val |= (v_n + 2) << (pLSB - 1);
         }
-        dp[0] = val;
+        dp[0] = static_cast<int32_t>(val);
 
         v_n = 0;
         val = 0;
@@ -406,14 +406,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
           MagSgn.advance(m_n);                                // consume m_n
 
           val = ms_val << 31;                      // get sign bit
-          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
           v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
           v_n |= 1;                                // add center of bin
           // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
           // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
           val |= (v_n + 2) << (pLSB - 1);
         }
-        dp[block->blksampl_stride] = val;
+        dp[block->blksampl_stride] = static_cast<int32_t>(val);
         vp[0]                      = prev_v_n | v_n;
         prev_v_n                   = 0;
         ++dp;
@@ -431,14 +431,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
           MagSgn.advance(m_n);                                // consume m_n
 
           val = ms_val << 31;                      // get sign bit
-          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
           v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
           v_n |= 1;                                // add center of bin
           // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
           // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
           val |= (v_n + 2) << (pLSB - 1);
         }
-        dp[0] = val;
+        dp[0] = static_cast<int32_t>(val);
 
         v_n = 0;
         val = 0;
@@ -450,14 +450,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
           MagSgn.advance(m_n);                                // consume m_n
 
           val = ms_val << 31;                      // get sign bit
-          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
           v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
           v_n |= 1;                                // add center of bin
           // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
           // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
           val |= (v_n + 2) << (pLSB - 1);
         }
-        dp[block->blksampl_stride] = val;
+        dp[block->blksampl_stride] = static_cast<int32_t>(val);
         prev_v_n                   = v_n;
         ++dp;
         ++x;

diff --git a/source/core/coding/ht_block_decoding.hpp b/source/core/coding/ht_block_decoding.hpp
@@ -866,7 +866,7 @@ class fwd_buf {
    */
   FORCE_INLINE void advance(uint32_t num_bits) {
     // if (!num_bits) return;
-    if (!(num_bits >= 0 && num_bits <= this->bits && num_bits < 128)) {
+    if (!(num_bits <= this->bits && num_bits < 128)) {
       printf("Value of numbits = %d is out of range.\n", num_bits);
       throw std::exception();
     }

diff --git a/source/core/coding/ht_block_encoding_avx2.hpp b/source/core/coding/ht_block_encoding_avx2.hpp
@@ -85,22 +85,22 @@ class state_MS_enc {
     uint32_t t = 0;
 
     // _bzhi_u32(UINT32_MAX, len) = ((1U << len) - 1U)
-    tmp = val & ((1 << (8 - stuff)) - 1);
+    tmp = val & ((1U << (8U - stuff)) - 1U);
     t |= tmp;
     bits_local += 8 - stuff;
     stuff = (tmp == 0xFF);
 
-    tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1);
+    tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U);
     t |= tmp << 8;
     bits_local += 8 - stuff;
     stuff = (tmp == 0xFF);
 
-    tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1);
+    tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U);
     t |= tmp << 16;
     bits_local += 8 - stuff;
     stuff = (tmp == 0xFF);
 
-    tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1);
+    tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U);
     t |= tmp << 24;
     bits_local += 8 - stuff;
     last = tmp & 0xFF;
@@ -168,7 +168,7 @@ class state_MS_enc {
     }
 #else
     for (int i = 0; i < 4; ++i) {
-      Creg |= static_cast<__uint128_t>(_mm_cvtsi128_si32(v)) << ctreg;
+      Creg |= static_cast<uint64_t>(static_cast<__uint128_t>(_mm_cvtsi128_si32(v)) << ctreg);
       ctreg += static_cast<uint32_t>(_mm_cvtsi128_si32(m));
       v = _mm_srli_si128(v, 4);
       m = _mm_srli_si128(m, 4);

diff --git a/source/core/transform/color.cpp b/source/core/transform/color.cpp
@@ -26,7 +26,7 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#if !defined(OPENHTJ2K_TRY_AVX2) && !defined(__AVX2__) && !defined(OPENHTJ2K_ENABLE_ARM_NEON)
+#if not defined(OPENHTJ2K_TRY_AVX2) && not defined(OPENHTJ2K_ENABLE_ARM_NEON)
   #include "color.hpp"
 
 void cvt_rgb_to_ycbcr_rev(int32_t *sp0, int32_t *sp1, int32_t *sp2, uint32_t width, uint32_t height) {