From 9526745840fc6755352de623f2b821390b4f4441 Mon Sep 17 00:00:00 2001
From: OSAMU WATANABE <owatanab@es.takushoku-u.ac.jp>
Date: Fri, 9 Aug 2024 15:56:55 +0900
Subject: [PATCH 1/6] Fix PGX parsing problem

---
 source/apps/imgcmp/image_class.hpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/source/apps/imgcmp/image_class.hpp b/source/apps/imgcmp/image_class.hpp
index c4e71ea..f80150d 100644
--- a/source/apps/imgcmp/image_class.hpp
+++ b/source/apps/imgcmp/image_class.hpp
@@ -223,7 +223,7 @@ class image {
         }
       }
       // read numerical value
-      while (c != SP && c != LF && c != CR) {
+      while (c >= '0' && c <= '9') {
         val *= 10;
         val += c - '0';
         c = fgetc(fp);
@@ -255,8 +255,10 @@ class image {
       }
     }
     // easting trailing spaces/LF/CR or comments
-    c = fgetc(fp);
+    c         = fgetc(fp);
+    int count = 0;
     while (c == SP || c == LF) {
+      count++;
       c = fgetc(fp);
       if (c == '#') {
         char *nouse = fgets(comment, sizeof(comment), fp);
@@ -267,7 +269,7 @@ class image {
         c = fgetc(fp);
       }
     }
-    fseek(fp, -1, SEEK_CUR);
+    fseek(fp, -count - 1, SEEK_CUR);
 
     const uint_fast8_t nbytes = static_cast<uint_fast8_t>((bitDepth + 7) / 8);  // ceil bitDepth to byte
     const size_t num_samples  = this->width * this->height * num_components;

From e366d28a061a037d7de28e1f6426e4a7ab495e63 Mon Sep 17 00:00:00 2001
From: OSAMU WATANABE <owatanab@es.takushoku-u.ac.jp>
Date: Fri, 9 Aug 2024 15:57:22 +0900
Subject: [PATCH 2/6] Add alternative decoding logic

---
 source/core/coding/ht_block_decoding.cpp | 416 +++++++++++++++++++++++
 1 file changed, 416 insertions(+)

diff --git a/source/core/coding/ht_block_decoding.cpp b/source/core/coding/ht_block_decoding.cpp
index ec2373c..becde89 100644
--- a/source/core/coding/ht_block_decoding.cpp
+++ b/source/core/coding/ht_block_decoding.cpp
@@ -53,6 +53,422 @@ uint8_t j2k_codeblock::calc_mbr(const uint32_t i, const uint32_t j, const uint8_
 
 void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t Lcup, const int32_t Pcup,
                        const int32_t Scup) {
+  uint8_t *compressed_data = block->get_compressed_data();
+  const uint16_t QW        = static_cast<uint16_t>(ceil_int(static_cast<int16_t>(block->size.x), 2));
+  const uint16_t QH        = static_cast<uint16_t>(ceil_int(static_cast<int16_t>(block->size.y), 2));
+
+  uint16_t scratch[8 * 513] = {0};
+  int32_t sstr              = static_cast<int32_t>(((block->size.x + 2) + 7u) & ~7u);  // multiples of 8
+  uint16_t *sp;
+  int32_t qx;
+  /*******************************************************************************************************************/
+  // VLC, UVLC and MEL decoding
+  /*******************************************************************************************************************/
+  MEL_dec MEL(compressed_data, Lcup, Scup);
+  rev_buf VLC_dec(compressed_data, Lcup, Scup);
+  auto sp0 = block->block_states + 1 + block->blkstate_stride;
+  auto sp1 = block->block_states + 1 + 2 * block->blkstate_stride;
+  uint32_t u_off0, u_off1;
+  uint32_t u0, u1;
+  uint32_t context = 0;
+  uint32_t vlcval;
+
+  const uint16_t *dec_table;
+  // Initial line-pair
+  dec_table       = dec_CxtVLC_table0_fast_16;
+  sp              = scratch;
+  int32_t mel_run = MEL.get_run();
+  for (qx = QW; qx > 0; qx -= 2, sp += 4) {
+    // Decoding of significance and EMB patterns and unsigned residual offsets
+    vlcval       = VLC_dec.fetch();
+    uint16_t tv0 = dec_table[(vlcval & 0x7F) + context];
+    if (context == 0) {
+      mel_run -= 2;
+      tv0 = (mel_run == -1) ? tv0 : 0;
+      if (mel_run < 0) {
+        mel_run = MEL.get_run();
+      }
+    }
+    sp[0] = tv0;
+
+    // calculate context for the next quad, Eq. (1) in the spec
+    context = ((tv0 & 0xE0U) << 2) | ((tv0 & 0x10U) << 3);  // = context << 7
+
+    // Decoding of significance and EMB patterns and unsigned residual offsets
+    vlcval       = VLC_dec.advance((tv0 & 0x000F) >> 1);
+    uint16_t tv1 = dec_table[(vlcval & 0x7F) + context];
+    if (context == 0 && qx > 1) {
+      mel_run -= 2;
+      tv1 = (mel_run == -1) ? tv1 : 0;
+      if (mel_run < 0) {
+        mel_run = MEL.get_run();
+      }
+    }
+    tv1   = (qx > 1) ? tv1 : 0;
+    sp[2] = tv1;
+
+    // store sigma
+    *sp0++ = ((tv0 >> 4) >> 0) & 1;
+    *sp0++ = ((tv0 >> 4) >> 2) & 1;
+    *sp0++ = ((tv1 >> 4) >> 0) & 1;
+    *sp0++ = ((tv1 >> 4) >> 2) & 1;
+    *sp1++ = ((tv0 >> 4) >> 1) & 1;
+    *sp1++ = ((tv0 >> 4) >> 3) & 1;
+    *sp1++ = ((tv1 >> 4) >> 1) & 1;
+    *sp1++ = ((tv1 >> 4) >> 3) & 1;
+
+    // calculate context for the next quad, Eq. (1) in the spec
+    context = ((tv1 & 0xE0U) << 2) | ((tv1 & 0x10U) << 3);  // = context << 7
+
+    vlcval = VLC_dec.advance((tv1 & 0x000F) >> 1);
+    u_off0 = tv0 & 1;
+    u_off1 = tv1 & 1;
+
+    uint32_t mel_offset = 0;
+    if (u_off0 == 1 && u_off1 == 1) {
+      mel_run -= 2;
+      mel_offset = (mel_run == -1) ? 0x40 : 0;
+      if (mel_run < 0) {
+        mel_run = MEL.get_run();
+      }
+    }
+
+    // UVLC decoding
+    uint32_t idx         = (vlcval & 0x3F) + (u_off0 << 6U) + (u_off1 << 7U) + mel_offset;
+    uint32_t uvlc_result = uvlc_dec_0[idx];
+    // remove total prefix length
+    vlcval = VLC_dec.advance(uvlc_result & 0x7);
+    uvlc_result >>= 3;
+    // extract suffixes for quad 0 and 1
+    uint32_t len = uvlc_result & 0xF;  // suffix length for 2 quads (up to 10 = 5 + 5)
+    //  ((1U << len) - 1U) can be replaced with _bzhi_u32(UINT32_MAX, len); not fast
+    uint32_t tmp = vlcval & ((1U << len) - 1U);  // suffix value for 2 quads
+    vlcval       = VLC_dec.advance(len);
+    uvlc_result >>= 4;
+    // quad 0 length
+    len = uvlc_result & 0x7;  // quad 0 suffix length
+    uvlc_result >>= 3;
+    // U = 1+ u
+    u0 = 1 + (uvlc_result & 7) + (tmp & ~(0xFFU << len));  // always kappa = 1 in initial line pair
+    u1 = 1 + (uvlc_result >> 3) + (tmp >> len);            // always kappa = 1 in initial line pair
+
+    sp[1] = static_cast<uint16_t>(u0);
+    sp[3] = static_cast<uint16_t>(u1);
+  }
+  // sp[0] = sp[1] = 0;
+
+  // Non-initial line-pair
+  dec_table = dec_CxtVLC_table1_fast_16;
+  for (uint16_t row = 1; row < QH; row++) {
+    sp0 = block->block_states + (row * 2U + 1U) * block->blkstate_stride + 1U;
+    sp1 = sp0 + block->blkstate_stride;
+
+    sp = scratch + row * sstr;
+    // calculate context for the next quad: w, sw, nw are always 0 at the head of a row
+    context = ((sp[0 - sstr] & 0xA0U) << 2) | ((sp[2 - sstr] & 0x20U) << 4);
+    for (qx = QW; qx > 0; qx -= 2, sp += 4) {
+      // Decoding of significance and EMB patterns and unsigned residual offsets
+      vlcval       = VLC_dec.fetch();
+      uint16_t tv0 = dec_table[(vlcval & 0x7F) + context];
+      if (context == 0) {
+        mel_run -= 2;
+        tv0 = (mel_run == -1) ? tv0 : 0;
+        if (mel_run < 0) {
+          mel_run = MEL.get_run();
+        }
+      }
+      // calculate context for the next quad, Eq. (2) in the spec
+      context = ((tv0 & 0x40U) << 2) | ((tv0 & 0x80U) << 1);              // (w | sw) << 8
+      context |= (sp[0 - sstr] & 0x80U) | ((sp[2 - sstr] & 0xA0U) << 2);  // ((nw | n) << 7) | (ne << 9)
+      context |= (sp[4 - sstr] & 0x20U) << 4;                             // ( nf) << 9
+
+      sp[0] = tv0;
+
+      vlcval = VLC_dec.advance((tv0 & 0x000F) >> 1);
+
+      // Decoding of significance and EMB patterns and unsigned residual offsets
+      uint16_t tv1 = dec_table[(vlcval & 0x7F) + context];
+      if (context == 0 && qx > 1) {
+        mel_run -= 2;
+        tv1 = (mel_run == -1) ? tv1 : 0;
+        if (mel_run < 0) {
+          mel_run = MEL.get_run();
+        }
+      }
+      tv1 = (qx > 1) ? tv1 : 0;
+      // calculate context for the next quad, Eq. (2) in the spec
+      context = ((tv1 & 0x40U) << 2) | ((tv1 & 0x80U) << 1);              // (w | sw) << 8
+      context |= (sp[2 - sstr] & 0x80U) | ((sp[4 - sstr] & 0xA0U) << 2);  // ((nw | n) << 7) | (ne << 9)
+      context |= (sp[6 - sstr] & 0x20U) << 4;                             // ( nf) << 9
+
+      sp[2] = tv1;
+
+      // store sigma
+      *sp0++ = ((tv0 >> 4) >> 0) & 1;
+      *sp0++ = ((tv0 >> 4) >> 2) & 1;
+      *sp0++ = ((tv1 >> 4) >> 0) & 1;
+      *sp0++ = ((tv1 >> 4) >> 2) & 1;
+      *sp1++ = ((tv0 >> 4) >> 1) & 1;
+      *sp1++ = ((tv0 >> 4) >> 3) & 1;
+      *sp1++ = ((tv1 >> 4) >> 1) & 1;
+      *sp1++ = ((tv1 >> 4) >> 3) & 1;
+
+      vlcval = VLC_dec.advance((tv1 & 0x000F) >> 1);
+
+      // UVLC decoding
+      u_off0       = tv0 & 1;
+      u_off1       = tv1 & 1;
+      uint32_t idx = (vlcval & 0x3F) + (u_off0 << 6U) + (u_off1 << 7U);
+
+      uint32_t uvlc_result = uvlc_dec_1[idx];
+      // remove total prefix length
+      vlcval = VLC_dec.advance(uvlc_result & 0x7);
+      uvlc_result >>= 3;
+      // extract suffixes for quad 0 and 1
+      uint32_t len = uvlc_result & 0xF;  // suffix length for 2 quads (up to 10 = 5 + 5)
+      //  ((1U << len) - 1U) can be replaced with _bzhi_u32(UINT32_MAX, len); not fast
+      uint32_t tmp = vlcval & ((1U << len) - 1U);  // suffix value for 2 quads
+      vlcval       = VLC_dec.advance(len);
+      uvlc_result >>= 4;
+      // quad 0 length
+      len = uvlc_result & 0x7;  // quad 0 suffix length
+      uvlc_result >>= 3;
+      u0 = (uvlc_result & 7) + (tmp & ~(0xFFU << len));
+      u1 = (uvlc_result >> 3) + (tmp >> len);
+
+      sp[1] = static_cast<uint16_t>(u0);
+      sp[3] = static_cast<uint16_t>(u1);
+    }
+    // sp[0] = sp[1] = 0;
+  }
+
+  /*******************************************************************************************************************/
+  // MagSgn decoding
+  /*******************************************************************************************************************/
+  {
+    // We allocate a scratch row for storing v_n values.
+    // We have 512 quads horizontally.
+    // We need an extra entry to handle the case of vp[1]
+    // when vp is at the last column.
+    // Here, we allocate 4 instead of 1 to make the buffer size
+    // a multipled of 16 bytes.
+    const int v_n_size             = 512 + 4;
+    uint32_t v_n_scratch[v_n_size] = {0};  // 2+ kB
+
+    fwd_buf<0xFF> MagSgn(compressed_data, Pcup);
+
+    uint16_t *sp = scratch;
+    uint32_t *vp = v_n_scratch;
+    int32_t *dp  = block->sample_buf;
+
+    uint32_t prev_v_n = 0;
+    for (uint32_t x = 0; x < block->size.x; sp += 2, ++vp) {
+      uint32_t inf = sp[0];
+      uint32_t U_q = sp[1];
+      if (U_q > ((30 - pLSB) + 2)) {
+        printf("ERROR\n");
+      }
+
+      uint32_t v_n;
+      uint32_t val = 0;
+      uint32_t bit = 0;
+      if (inf & (1 << (4 + bit))) {
+        // get 32 bits of magsgn data
+        uint32_t ms_val = MagSgn.fetch();
+        uint32_t m_n    = U_q - ((inf >> (12 + bit)) & 1);  // remove e_k
+        MagSgn.advance(m_n);                                // consume m_n
+
+        val = ms_val << 31;                      // get sign bit
+        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
+        v_n |= 1;                                // add center of bin
+        // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+        // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+        val |= (v_n + 2) << (pLSB - 1);
+      }
+      dp[0] = val;
+
+      v_n = 0;
+      val = 0;
+      bit = 1;
+      if (inf & (1 << (4 + bit))) {
+        // get 32 bits of magsgn data
+        uint32_t ms_val = MagSgn.fetch();
+        uint32_t m_n    = U_q - ((inf >> (12 + bit)) & 1);  // remove e_k
+        MagSgn.advance(m_n);                                // consume m_n
+
+        val = ms_val << 31;                      // get sign bit
+        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
+        v_n |= 1;                                // add center of bin
+        // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+        // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+        val |= (v_n + 2) << (pLSB - 1);
+      }
+      dp[block->blksampl_stride] = val;
+      vp[0]                      = prev_v_n | v_n;
+      prev_v_n                   = 0;
+      ++dp;
+      if (++x >= block->size.x) {
+        ++vp;
+        break;
+      }
+
+      val = 0;
+      bit = 2;
+      if (inf & (1 << (4 + bit))) {
+        // get 32 bits of magsgn data
+        uint32_t ms_val = MagSgn.fetch();
+        uint32_t m_n    = U_q - ((inf >> (12 + bit)) & 1);  // remove e_k
+        MagSgn.advance(m_n);                                // consume m_n
+
+        val = ms_val << 31;                      // get sign bit
+        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
+        v_n |= 1;                                // add center of bin
+        // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+        // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+        val |= (v_n + 2) << (pLSB - 1);
+      }
+      dp[0] = val;
+
+      v_n = 0;
+      val = 0;
+      bit = 3;
+      if (inf & (1 << (4 + bit))) {
+        // get 32 bits of magsgn data
+        uint32_t ms_val = MagSgn.fetch();
+        uint32_t m_n    = U_q - ((inf >> (12 + bit)) & 1);  // remove e_k
+        MagSgn.advance(m_n);                                // consume m_n
+
+        val = ms_val << 31;                      // get sign bit
+        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
+        v_n |= 1;                                // add center of bin
+        // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+        // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+        val |= (v_n + 2) << (pLSB - 1);
+      }
+      dp[block->blksampl_stride] = val;
+      prev_v_n                   = v_n;
+      ++dp;
+      ++x;
+    }
+    vp[0] = prev_v_n;
+
+    for (uint32_t y = 2; y < block->size.y; y += 2) {
+      uint16_t *sp = scratch + (y >> 1) * sstr;
+      uint32_t *vp = v_n_scratch;
+      int32_t *dp  = block->sample_buf + y * block->blksampl_stride;
+
+      prev_v_n = 0;
+      for (uint32_t x = 0; x < block->size.x; sp += 2, ++vp) {
+        uint32_t inf = sp[0];
+        uint32_t u_q = sp[1];
+
+        uint32_t gamma = inf & 0xF0;
+        gamma &= gamma - 0x10;  // is gamma_q 1?
+        uint32_t emax  = vp[0] | vp[1];
+        emax           = 31 - count_leading_zeros(emax | 2);  // emax - 1
+        uint32_t kappa = gamma ? emax : 1;
+
+        uint32_t U_q = u_q + kappa;
+        if (U_q > ((30 - pLSB) + 2)) {
+          printf("ERROR\n");
+        }
+
+        uint32_t v_n;
+        uint32_t val = 0;
+        uint32_t bit = 0;
+        if (inf & (1 << (4 + bit))) {
+          // get 32 bits of magsgn data
+          uint32_t ms_val = MagSgn.fetch();
+          uint32_t m_n    = U_q - ((inf >> (12 + bit)) & 1);  // remove e_k
+          MagSgn.advance(m_n);                                // consume m_n
+
+          val = ms_val << 31;                      // get sign bit
+          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
+          v_n |= 1;                                // add center of bin
+          // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+          // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+          val |= (v_n + 2) << (pLSB - 1);
+        }
+        dp[0] = val;
+
+        v_n = 0;
+        val = 0;
+        bit = 1;
+        if (inf & (1 << (4 + bit))) {
+          // get 32 bits of magsgn data
+          uint32_t ms_val = MagSgn.fetch();
+          uint32_t m_n    = U_q - ((inf >> (12 + bit)) & 1);  // remove e_k
+          MagSgn.advance(m_n);                                // consume m_n
+
+          val = ms_val << 31;                      // get sign bit
+          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
+          v_n |= 1;                                // add center of bin
+          // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+          // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+          val |= (v_n + 2) << (pLSB - 1);
+        }
+        dp[block->blksampl_stride] = val;
+        vp[0]                      = prev_v_n | v_n;
+        prev_v_n                   = 0;
+        ++dp;
+        if (++x >= block->size.x) {
+          ++vp;
+          break;
+        }
+
+        val = 0;
+        bit = 2;
+        if (inf & (1 << (4 + bit))) {
+          // get 32 bits of magsgn data
+          uint32_t ms_val = MagSgn.fetch();
+          uint32_t m_n    = U_q - ((inf >> (12 + bit)) & 1);  // remove e_k
+          MagSgn.advance(m_n);                                // consume m_n
+
+          val = ms_val << 31;                      // get sign bit
+          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
+          v_n |= 1;                                // add center of bin
+          // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+          // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+          val |= (v_n + 2) << (pLSB - 1);
+        }
+        dp[0] = val;
+
+        v_n = 0;
+        val = 0;
+        bit = 3;
+        if (inf & (1 << (4 + bit))) {
+          // get 32 bits of magsgn data
+          uint32_t ms_val = MagSgn.fetch();
+          uint32_t m_n    = U_q - ((inf >> (12 + bit)) & 1);  // remove e_k
+          MagSgn.advance(m_n);                                // consume m_n
+
+          val = ms_val << 31;                      // get sign bit
+          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
+          v_n |= 1;                                // add center of bin
+          // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+          // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+          val |= (v_n + 2) << (pLSB - 1);
+        }
+        dp[block->blksampl_stride] = val;
+        prev_v_n                   = v_n;
+        ++dp;
+        ++x;
+      }
+      vp[0] = prev_v_n;
+    }
+  }
+}
+
+void ht_cleanup_decode2(j2k_codeblock *block, const uint8_t &pLSB, const int32_t Lcup, const int32_t Pcup,
+                        const int32_t Scup) {
   fwd_buf<0xFF> MagSgn(block->get_compressed_data(), Pcup);
   MEL_dec MEL(block->get_compressed_data(), Lcup, Scup);
   rev_buf VLC_dec(block->get_compressed_data(), Lcup, Scup);

From f0fb96c3f78954b1016872ae81aa95fb59505c33 Mon Sep 17 00:00:00 2001
From: OSAMU WATANABE <owatanab@es.takushoku-u.ac.jp>
Date: Fri, 9 Aug 2024 15:58:12 +0900
Subject: [PATCH 3/6] Add stride access for DWT with non-SIMD and ARM NEON

---
 source/core/coding/coding_units.cpp |  25 ++++---
 source/core/coding/coding_units.hpp |  12 ++--
 source/core/transform/dwt.hpp       |  16 +++--
 source/core/transform/fdwt.cpp      |  92 +++++++++++++------------
 source/core/transform/fdwt_neon.cpp |   6 +-
 source/core/transform/idwt.cpp      | 102 ++++++++++++++--------------
 source/core/transform/idwt_neon.cpp |   6 +-
 7 files changed, 133 insertions(+), 126 deletions(-)

diff --git a/source/core/coding/coding_units.cpp b/source/core/coding/coding_units.cpp
index c08a108..c707483 100644
--- a/source/core/coding/coding_units.cpp
+++ b/source/core/coding/coding_units.cpp
@@ -269,8 +269,8 @@ j2k_precinct_subband::j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uin
                                            uint8_t transformation, float stepsize, sprec_t *ibuf,
                                            const element_siz &bp0, const element_siz &bp1,
                                            const element_siz &p0, const element_siz &p1,
-                                           const uint16_t &num_layers, const element_siz &codeblock_size,
-                                           const uint8_t &Cmodes)
+                                           const uint32_t band_stride, const uint16_t &num_layers,
+                                           const element_siz &codeblock_size, const uint8_t &Cmodes)
     : j2k_region(p0, p1),
       orientation(orientation),
       inclusion_info(nullptr),
@@ -290,7 +290,7 @@ j2k_precinct_subband::j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uin
   }
 
   const uint32_t num_codeblocks = this->num_codeblock_x * this->num_codeblock_y;
-  const uint32_t band_stride    = bp1.x - bp0.x;
+  //  const uint32_t band_stride    = stride;
   if (num_codeblocks != 0) {
     inclusion_info = new tagtree(this->num_codeblock_x, this->num_codeblock_y);
     ZBP_info       = new tagtree(this->num_codeblock_x, this->num_codeblock_y);
@@ -1029,7 +1029,7 @@ j2k_precinct::j2k_precinct(const uint8_t &r, const uint32_t &idx, const element_
     this->pband[i] = MAKE_UNIQUE<j2k_precinct_subband>(
         subband[i]->orientation, subband[i]->M_b, subband[i]->R_b, subband[i]->transformation,
         subband[i]->delta, subband[i]->i_samples, subband[i]->pos0, subband[i]->pos1, pbpos0, pbpos1,
-        num_layers, codeblock_size, Cmodes);
+        subband[i]->stride, num_layers, codeblock_size, Cmodes);
   }
 }
 
@@ -1059,8 +1059,9 @@ j2k_subband::j2k_subband(element_siz p0, element_siz p1, uint8_t orientation, ui
   if (num_samples) {
     if (orientation != BAND_LL) {
       // If not the lowest resolution, buffers for subbands shall be created.
-      i_samples = static_cast<sprec_t *>(aligned_mem_alloc(sizeof(sprec_t) * num_samples, 32));
-      memset(i_samples, 0, sizeof(sprec_t) * num_samples);
+      i_samples =
+          static_cast<sprec_t *>(aligned_mem_alloc(sizeof(sprec_t) * this->stride * (pos1.y - pos0.y), 32));
+      memset(i_samples, 0, sizeof(sprec_t) * this->stride * (pos1.y - pos0.y));
     } else {
       i_samples = ibuf;
     }
@@ -1096,10 +1097,12 @@ j2k_resolution::j2k_resolution(const uint8_t &r, const element_siz &p0, const el
   i_samples = nullptr;
   if (!is_empty) {
     if (index == 0) {
-      i_samples = static_cast<sprec_t *>(aligned_mem_alloc(sizeof(sprec_t) * num_samples, 32));
+      i_samples =
+          static_cast<sprec_t *>(aligned_mem_alloc(sizeof(sprec_t) * this->stride * (pos1.y - pos0.y), 32));
       memset(i_samples, 0, sizeof(sprec_t) * num_samples);
     } else {
-      i_samples = static_cast<sprec_t *>(aligned_mem_alloc(sizeof(sprec_t) * num_samples, 32));
+      i_samples =
+          static_cast<sprec_t *>(aligned_mem_alloc(sizeof(sprec_t) * this->stride * (pos1.y - pos0.y), 32));
     }
   }
 }
@@ -2504,7 +2507,7 @@ void j2k_tile::decode() {
 
     // copy samples in resolution buffer to that in tile component buffer
     uint32_t height = tc1.y - tc0.y;
-    uint32_t width  = tc1.x - tc0.x;
+    uint32_t width  = round_up(tc1.x - tc0.x, 32);
     uint32_t stride = round_up(width, 32U);
     // size_t num_samples = static_cast<size_t>(tc1.x - tc0.x) * (tc1.y - tc0.y);
 #if defined(OPENHTJ2K_ENABLE_ARM_NEON)
@@ -3027,7 +3030,7 @@ uint8_t *j2k_tile::encode() {
 #elif defined(OPENHTJ2K_ENABLE_ARM_NEON)
     for (uint32_t y = 0; y < height; ++y) {
       int32_t *sp             = src + y * stride;
-      sprec_t *dp             = cr->i_samples + y * (bottom_right.x - top_left.x);
+      sprec_t *dp             = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32);
       uint32_t num_tc_samples = bottom_right.x - top_left.x;
       for (; num_tc_samples >= 8; num_tc_samples -= 8) {
         auto vsrc0 = vld1q_s32(sp);
@@ -3043,7 +3046,7 @@ uint8_t *j2k_tile::encode() {
 #else
       for (uint32_t y = 0; y < height; ++y) {
         int32_t *sp             = src + y * stride;
-        sprec_t *dp             = cr->i_samples + y * (bottom_right.x - top_left.x);
+        sprec_t *dp             = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32);
         uint32_t num_tc_samples = bottom_right.x - top_left.x;
         for (; num_tc_samples > 0; --num_tc_samples) {
           *dp++ = static_cast<sprec_t>(*sp++);
diff --git a/source/core/coding/coding_units.hpp b/source/core/coding/coding_units.hpp
index 6aa5e5d..d417798 100644
--- a/source/core/coding/coding_units.hpp
+++ b/source/core/coding/coding_units.hpp
@@ -41,6 +41,9 @@ class j2k_region {
   element_siz pos0;
   // bottom-right coordinate (exclusive) of a region in the reference grid
   element_siz pos1;
+  // width for line buffer
+  uint32_t stride;
+
   // return top-left coordinate (inclusive)
   [[nodiscard]] element_siz get_pos0() const { return pos0; }
   // return bottom-right coordinate (exclusive)
@@ -55,7 +58,7 @@ class j2k_region {
   // set bottom-right coordinate (exclusive)
   void set_pos1(element_siz in) { pos1 = in; }
   j2k_region() = default;
-  j2k_region(element_siz p0, element_siz p1) : pos0(p0), pos1(p1) {}
+  j2k_region(element_siz p0, element_siz p1) : pos0(p0), pos1(p1), stride(round_up(pos1.x - pos0.x, 32)) {}
 };
 
 /********************************************************************************
@@ -177,8 +180,9 @@ class j2k_precinct_subband : public j2k_region {
   uint32_t num_codeblock_y;
   j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uint8_t R_b, uint8_t transformation,
                        float stepsize, sprec_t *ibuf, const element_siz &bp0, const element_siz &bp1,
-                       const element_siz &p0, const element_siz &p1, const uint16_t &num_layers,
-                       const element_siz &codeblock_size, const uint8_t &Cmodes);
+                       const element_siz &p0, const element_siz &p1, const uint32_t stride,
+                       const uint16_t &num_layers, const element_siz &codeblock_size,
+                       const uint8_t &Cmodes);
   ~j2k_precinct_subband() {
     delete inclusion_info;
     delete ZBP_info;
@@ -253,7 +257,7 @@ class j2c_packet {
   uint32_t length;
 
   j2c_packet()
-      : layer(0), resolution(0), component(0), precinct(0), header(nullptr), body(nullptr), length(0){};
+      : layer(0), resolution(0), component(0), precinct(0), header(nullptr), body(nullptr), length(0) {};
   // constructor for decoding
   j2c_packet(const uint16_t l, const uint8_t r, const uint16_t c, const uint32_t p,
              buf_chain *const h = nullptr, buf_chain *const bo = nullptr)
diff --git a/source/core/transform/dwt.hpp b/source/core/transform/dwt.hpp
index 5d50078..f65d398 100644
--- a/source/core/transform/dwt.hpp
+++ b/source/core/transform/dwt.hpp
@@ -60,11 +60,11 @@ constexpr int32_t Dshift = 16;
 // define pointer to FDWT functions
 typedef void (*fdwt_1d_filtr_func_fixed)(sprec_t *, const int32_t, const int32_t, const int32_t);
 typedef void (*fdwt_ver_filtr_func_fixed)(sprec_t *, const int32_t, const int32_t, const int32_t,
-                                          const int32_t);
+                                          const int32_t, const int32_t stride);
 // define pointer to IDWT functions
 typedef void (*idwt_1d_filtd_func_fixed)(sprec_t *, const int32_t, const int32_t, const int32_t);
 typedef void (*idwt_ver_filtd_func_fixed)(sprec_t *, const int32_t, const int32_t, const int32_t,
-                                          const int32_t);
+                                          const int32_t, const int32_t stride);
 
 // symmetric extension
 static inline int32_t PSEo(const int32_t i, const int32_t i0, const int32_t i1) {
@@ -90,8 +90,10 @@ static inline void dwt_1d_extr_fixed(T *extbuf, T *buf, const int32_t left, cons
 #if defined(OPENHTJ2K_ENABLE_ARM_NEON)
 void fdwt_1d_filtr_irrev97_fixed_neon(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
 void fdwt_1d_filtr_rev53_fixed_neon(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
-void fdwt_irrev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1);
-void fdwt_rev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1);
+void fdwt_irrev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1,
+                                  int32_t stride);
+void fdwt_rev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1,
+                                int32_t stride);
 
 #elif defined(OPENHTJ2K_ENABLE_AVX2)
 void fdwt_1d_filtr_irrev97_fixed_avx2(sprec_t *X, const int32_t left, const int32_t u_i0,
@@ -115,8 +117,10 @@ void fdwt_2d_sr_fixed(sprec_t *previousLL, sprec_t *LL, sprec_t *HL, sprec_t *LH
 #if defined(OPENHTJ2K_ENABLE_ARM_NEON)
 void idwt_1d_filtr_rev53_fixed_neon(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
 void idwt_1d_filtr_irrev97_fixed_neon(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
-void idwt_irrev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1);
-void idwt_rev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1);
+void idwt_irrev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1,
+                                  int32_t stride);
+void idwt_rev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1,
+                                int32_t stride);
 #elif defined(OPENHTJ2K_ENABLE_AVX2)
 void idwt_1d_filtr_rev53_fixed_avx2(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
 void idwt_1d_filtr_irrev97_fixed_avx2(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
diff --git a/source/core/transform/fdwt.cpp b/source/core/transform/fdwt.cpp
index e1ef89d..c43f2c6 100644
--- a/source/core/transform/fdwt.cpp
+++ b/source/core/transform/fdwt.cpp
@@ -107,8 +107,7 @@ static inline void fdwt_1d_sr_fixed(sprec_t *buf, sprec_t *in, const int32_t lef
 
 // FDWT for horizontal direction
 static void fdwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                              const int32_t v1, const uint8_t transformation) {
-  const int32_t stride               = u1 - u0;
+                              const int32_t v1, const uint8_t transformation, const int32_t stride) {
   constexpr int32_t num_pse_i0[2][2] = {{4, 2}, {3, 1}};
   constexpr int32_t num_pse_i1[2][2] = {{3, 1}, {4, 2}};
   const int32_t left                 = num_pse_i0[u0 % 2][transformation];
@@ -118,9 +117,10 @@ static void fdwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, c
     // one sample case
     for (int32_t row = 0; row < v1 - v0; ++row) {
       if (u0 % 2 == 0) {
-        in[row] = (transformation) ? in[row] : in[row];
+        in[row * stride] = (transformation) ? in[row * stride] : in[row * stride];
       } else {
-        in[row] = (transformation) ? static_cast<sprec_t>(in[row] << 1) : in[row];
+        in[row * stride] =
+            (transformation) ? static_cast<sprec_t>(in[row * stride] << 1) : in[row * stride];
       }
     }
   } else {
@@ -138,8 +138,7 @@ static void fdwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, c
 }
 
 void fdwt_irrev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                             const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                             const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {4, 3};
   constexpr int32_t num_pse_i1[2] = {3, 4};
   const int32_t top               = num_pse_i0[v0 % 2];
@@ -215,8 +214,7 @@ void fdwt_irrev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, co
 }
 
 void fdwt_rev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                           const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                           const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {2, 1};
   constexpr int32_t num_pse_i1[2] = {1, 2};
   const int32_t top               = num_pse_i0[v0 % 2];
@@ -280,8 +278,8 @@ void fdwt_rev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, cons
 // Deinterleaving to devide coefficients into subbands
 static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t *const HL,
                                        sprec_t *const LH, sprec_t *const HH, const int32_t u0,
-                                       const int32_t u1, const int32_t v0, const int32_t v1) {
-  const int32_t stride     = u1 - u0;
+                                       const int32_t u1, const int32_t v0, const int32_t v1,
+                                       const int32_t stride) {
   const int32_t v_offset   = v0 % 2;
   const int32_t u_offset   = u0 % 2;
   sprec_t *dp[4]           = {LL, HL, LH, HH};
@@ -291,41 +289,43 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
   const int32_t ustop[4]   = {ceil_int(u1, 2), u1 / 2, ceil_int(u1, 2), u1 / 2};
   const int32_t voffset[4] = {v_offset, v_offset, 1 - v_offset, 1 - v_offset};
   const int32_t uoffset[4] = {u_offset, 1 - u_offset, u_offset, 1 - u_offset};
+  const int32_t stride2[4] = {round_up(ustop[0] - ustart[0], 32), round_up(ustop[1] - ustart[1], 32),
+                              round_up(ustop[2] - ustart[2], 32), round_up(ustop[3] - ustart[3], 32)};
 
 #if defined(OPENHTJ2K_ENABLE_ARM_NEON)
   if ((ustop[0] - ustart[0]) != (ustop[1] - ustart[1])) {
     for (uint8_t b = 0; b < 2; ++b) {
       for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+        sprec_t *line = dp[b] + v * stride2[b];
         for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-          *(dp[b]++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
+          *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
         }
       }
     }
   } else {
     sprec_t *first, *second;
-    first  = LL;
-    second = HL;
+    first  = dp[0];
+    second = dp[1];
     if (uoffset[0] > uoffset[1]) {
-      first  = HL;
-      second = LL;
+      first  = dp[1];
+      second = dp[0];
     }
     for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) {
-      sprec_t *sp = buf + (2 * v + voffset[0]) * stride;
-      size_t len  = static_cast<size_t>(ustop[0] - ustart[0]);
+      sprec_t *sp    = buf + (2 * v + voffset[0]) * stride;
+      size_t len     = static_cast<size_t>(ustop[0] - ustart[0]);
+      sprec_t *line0 = first + v * stride2[0];
+      sprec_t *line1 = second + v * stride2[0];
       for (; len >= 8; len -= 8) {
-        openhtj2k_arm_prefetch2(sp, 0);
-        openhtj2k_arm_prefetch2(first, 1);
-        openhtj2k_arm_prefetch2(second, 1);
         auto vline = vld2q_s16(sp);
-        vst1q_s16(first, vline.val[0]);
-        vst1q_s16(second, vline.val[1]);
-        first += 8;
-        second += 8;
+        vst1q_s16(line0, vline.val[0]);
+        vst1q_s16(line1, vline.val[1]);
+        line0 += 8;
+        line1 += 8;
         sp += 16;
       }
       for (; len > 0; --len) {
-        *first++  = *sp++;
-        *second++ = *sp++;
+        *line0++ = *sp++;
+        *line1++ = *sp++;
       }
     }
   }
@@ -333,8 +333,9 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
   if ((ustop[2] - ustart[2]) != (ustop[3] - ustart[3])) {
     for (uint8_t b = 2; b < 4; ++b) {
       for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+        sprec_t *line = dp[b] + v * stride2[b];
         for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-          *(dp[b]++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
+          *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
         }
       }
     }
@@ -347,22 +348,21 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
       second = LH;
     }
     for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) {
-      sprec_t *sp = buf + (2 * v + voffset[2]) * stride;
-      size_t len  = static_cast<size_t>(ustop[2] - ustart[2]);
+      sprec_t *sp    = buf + (2 * v + voffset[2]) * stride;
+      size_t len     = static_cast<size_t>(ustop[2] - ustart[2]);
+      sprec_t *line0 = first + v * stride2[2];
+      sprec_t *line1 = second + v * stride2[2];
       for (; len >= 8; len -= 8) {
-        openhtj2k_arm_prefetch2(sp, 0);
-        openhtj2k_arm_prefetch2(first, 1);
-        openhtj2k_arm_prefetch2(second, 1);
         auto vline = vld2q_s16(sp);
-        vst1q_s16(first, vline.val[0]);
-        vst1q_s16(second, vline.val[1]);
-        first += 8;
-        second += 8;
+        vst1q_s16(line0, vline.val[0]);
+        vst1q_s16(line1, vline.val[1]);
+        line0 += 8;
+        line1 += 8;
         sp += 16;
       }
       for (; len > 0; --len) {
-        *first++  = *sp++;
-        *second++ = *sp++;
+        *line0++ = *sp++;
+        *line1++ = *sp++;
       }
     }
   }
@@ -384,7 +384,7 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
       second = LL;
     }
     const __m256i vshmask = _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14,
-                                             11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
+                                            11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
     for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) {
       sprec_t *sp = buf + (2 * v + voffset[0]) * stride;
       size_t len  = static_cast<size_t>(ustop[0] - ustart[0]);
@@ -433,7 +433,7 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
       second = LH;
     }
     const __m256i vshmask = _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14,
-                                             11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
+                                            11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
     for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) {
       sprec_t *sp = buf + (2 * v + voffset[2]) * stride;
       size_t len  = static_cast<size_t>(ustop[2] - ustart[2]);
@@ -467,8 +467,9 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
 #else
   for (uint8_t b = 0; b < 4; ++b) {
     for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+      sprec_t *line = dp[b] + v * stride2[b];
       for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-        *(dp[b]++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
+        *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
       }
     }
   }
@@ -479,8 +480,9 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
 void fdwt_2d_sr_fixed(sprec_t *previousLL, sprec_t *LL, sprec_t *HL, sprec_t *LH, sprec_t *HH,
                       const int32_t u0, const int32_t u1, const int32_t v0, const int32_t v1,
                       const uint8_t transformation) {
-  sprec_t *src = previousLL;
-  fdwt_ver_sr_fixed[transformation](src, u0, u1, v0, v1);
-  fdwt_hor_sr_fixed(src, u0, u1, v0, v1, transformation);
-  fdwt_2d_deinterleave_fixed(src, LL, HL, LH, HH, u0, u1, v0, v1);
+  const int32_t stride = round_up(u1 - u0, 32);
+  sprec_t *src         = previousLL;
+  fdwt_ver_sr_fixed[transformation](src, u0, u1, v0, v1, stride);
+  fdwt_hor_sr_fixed(src, u0, u1, v0, v1, transformation, stride);
+  fdwt_2d_deinterleave_fixed(src, LL, HL, LH, HH, u0, u1, v0, v1, stride);
 }
\ No newline at end of file
diff --git a/source/core/transform/fdwt_neon.cpp b/source/core/transform/fdwt_neon.cpp
index 8091b6b..033c759 100644
--- a/source/core/transform/fdwt_neon.cpp
+++ b/source/core/transform/fdwt_neon.cpp
@@ -261,8 +261,7 @@ auto fdwt_irrev97_fixed_neon_ver_step3 = [](const int32_t simdlen, int16_t *cons
     };
 
 void fdwt_irrev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                                  const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                                  const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {4, 3};
   constexpr int32_t num_pse_i1[2] = {3, 4};
   const int32_t top               = num_pse_i0[v0 % 2];
@@ -344,8 +343,7 @@ void fdwt_irrev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u
 
 // reversible FDWT
 void fdwt_rev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                                const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                                const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {2, 1};
   constexpr int32_t num_pse_i1[2] = {1, 2};
   const int32_t top               = num_pse_i0[v0 % 2];
diff --git a/source/core/transform/idwt.cpp b/source/core/transform/idwt.cpp
index 5eba173..35ab00d 100644
--- a/source/core/transform/idwt.cpp
+++ b/source/core/transform/idwt.cpp
@@ -102,8 +102,7 @@ static void idwt_1d_sr_fixed(sprec_t *buf, sprec_t *in, const int32_t left, cons
 }
 
 static void idwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                              const int32_t v1, const uint8_t transformation) {
-  const int32_t stride               = u1 - u0;
+                              const int32_t v1, const uint8_t transformation, const int32_t stride) {
   constexpr int32_t num_pse_i0[2][2] = {{3, 1}, {4, 2}};
   constexpr int32_t num_pse_i1[2][2] = {{4, 2}, {3, 1}};
   const int32_t left                 = num_pse_i0[u0 % 2][transformation];
@@ -114,7 +113,7 @@ static void idwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, c
     for (int32_t row = 0; row < v1 - v0; ++row) {
       //      in[row] = in[row];
       if (u0 % 2 != 0 && transformation) {
-        in[row] = static_cast<sprec_t>(in[row] >> 1);
+        in[row * stride] = static_cast<sprec_t>(in[row * stride] >> 1);
       }
     }
   } else {
@@ -131,8 +130,7 @@ static void idwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, c
 }
 
 void idwt_irrev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                             const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                             const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {3, 4};
   constexpr int32_t num_pse_i1[2] = {4, 3};
   const int32_t top               = num_pse_i0[v0 % 2];
@@ -204,8 +202,7 @@ void idwt_irrev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, co
 }
 
 void idwt_rev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                           const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                           const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {1, 2};
   constexpr int32_t num_pse_i1[2] = {2, 1};
   const int32_t top               = num_pse_i0[v0 % 2];
@@ -263,8 +260,7 @@ void idwt_rev_ver_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, cons
 }
 
 static void idwt_2d_interleave_fixed(sprec_t *buf, sprec_t *LL, sprec_t *HL, sprec_t *LH, sprec_t *HH,
-                                     int32_t u0, int32_t u1, int32_t v0, int32_t v1) {
-  const int32_t stride     = u1 - u0;
+                                     int32_t u0, int32_t u1, int32_t v0, int32_t v1, const int32_t stride) {
   const int32_t v_offset   = v0 % 2;
   const int32_t u_offset   = u0 % 2;
   sprec_t *sp[4]           = {LL, HL, LH, HH};
@@ -274,49 +270,50 @@ static void idwt_2d_interleave_fixed(sprec_t *buf, sprec_t *LL, sprec_t *HL, spr
   const int32_t ustop[4]   = {ceil_int(u1, 2), u1 / 2, ceil_int(u1, 2), u1 / 2};
   const int32_t voffset[4] = {v_offset, v_offset, 1 - v_offset, 1 - v_offset};
   const int32_t uoffset[4] = {u_offset, 1 - u_offset, u_offset, 1 - u_offset};
+  const int32_t stride2[4] = {round_up(ustop[0] - ustart[0], 32), round_up(ustop[1] - ustart[1], 32),
+                              round_up(ustop[2] - ustart[2], 32), round_up(ustop[3] - ustart[3], 32)};
 
 #if defined(OPENHTJ2K_ENABLE_ARM_NEON)
   if ((ustop[0] - ustart[0]) != (ustop[1] - ustart[1])) {
     for (uint8_t b = 0; b < 2; ++b) {
       for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+        sprec_t *line = sp[b] + v * stride2[b];
         for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-          buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(sp[b]++);
+          buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++);
         }
       }
     }
   } else {
     sprec_t *first, *second;
-    first  = LL;
-    second = HL;
+    first  = sp[0];
+    second = sp[1];
     if (uoffset[0] > uoffset[1]) {
-      first  = HL;
-      second = LL;
+      first  = sp[1];
+      second = sp[0];
     }
-    openhtj2k_arm_prefetch(first);
-    openhtj2k_arm_prefetch(second);
     int16x8_t vfirst0, vfirst1, vsecond0, vsecond1;
     //    int16x8x2_t vdst0, vdst1;
     for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) {
-      sprec_t *dp = buf + (2 * v + voffset[0]) * stride;
-      size_t len  = static_cast<size_t>(ustop[0] - ustart[0]);
+      sprec_t *dp    = buf + (2 * v + voffset[0]) * stride;
+      size_t len     = static_cast<size_t>(ustop[0] - ustart[0]);
+      sprec_t *line0 = first + v * stride2[0];
+      sprec_t *line1 = second + v * stride2[0];
       for (; len >= 16; len -= 16) {
-        vfirst0  = vld1q_s16(first);
-        vsecond0 = vld1q_s16(second);
+        vfirst0  = vld1q_s16(line0);
+        vsecond0 = vld1q_s16(line1);
         vst1q_s16(dp, vzip1q_s16(vfirst0, vsecond0));
         vst1q_s16(dp + 8, vzip2q_s16(vfirst0, vsecond0));
-        vfirst1  = vld1q_s16(first + 8);
-        vsecond1 = vld1q_s16(second + 8);
+        vfirst1  = vld1q_s16(line0 + 8);
+        vsecond1 = vld1q_s16(line1 + 8);
         vst1q_s16(dp + 16, vzip1q_s16(vfirst1, vsecond1));
         vst1q_s16(dp + 24, vzip2q_s16(vfirst1, vsecond1));
-        first += 16;
-        second += 16;
-        openhtj2k_arm_prefetch(first);
-        openhtj2k_arm_prefetch(second);
+        line0 += 16;
+        line1 += 16;
         dp += 32;
       }
       for (; len > 0; --len) {
-        *dp++ = *first++;
-        *dp++ = *second++;
+        *dp++ = *line0++;
+        *dp++ = *line1++;
       }
     }
   }
@@ -324,44 +321,43 @@ static void idwt_2d_interleave_fixed(sprec_t *buf, sprec_t *LL, sprec_t *HL, spr
   if ((ustop[2] - ustart[2]) != (ustop[3] - ustart[3])) {
     for (uint8_t b = 2; b < 4; ++b) {
       for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+        sprec_t *line = sp[b] + v * stride2[b];
         for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-          buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(sp[b]++);
+          buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++);
         }
       }
     }
   } else {
     sprec_t *first, *second;
-    first  = LH;
-    second = HH;
+    first  = sp[2];
+    second = sp[3];
     if (uoffset[2] > uoffset[3]) {
-      first  = HH;
-      second = LH;
+      first  = sp[3];
+      second = sp[2];
     }
-    openhtj2k_arm_prefetch(first);
-    openhtj2k_arm_prefetch(second);
     int16x8_t vfirst0, vfirst1, vsecond0, vsecond1;
     //    int16x8x2_t vdst0, vdst1;
     for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) {
-      sprec_t *dp = buf + (2 * v + voffset[2]) * stride;
-      size_t len  = static_cast<size_t>(ustop[2] - ustart[2]);
+      sprec_t *dp    = buf + (2 * v + voffset[2]) * stride;
+      size_t len     = static_cast<size_t>(ustop[2] - ustart[2]);
+      sprec_t *line0 = first + v * stride2[2];
+      sprec_t *line1 = second + v * stride2[2];
       for (; len >= 16; len -= 16) {
-        vfirst0  = vld1q_s16(first);
-        vsecond0 = vld1q_s16(second);
+        vfirst0  = vld1q_s16(line0);
+        vsecond0 = vld1q_s16(line1);
         vst1q_s16(dp, vzip1q_s16(vfirst0, vsecond0));
         vst1q_s16(dp + 8, vzip2q_s16(vfirst0, vsecond0));
-        vfirst1  = vld1q_s16(first + 8);
-        vsecond1 = vld1q_s16(second + 8);
+        vfirst1  = vld1q_s16(line0 + 8);
+        vsecond1 = vld1q_s16(line1 + 8);
         vst1q_s16(dp + 16, vzip1q_s16(vfirst1, vsecond1));
         vst1q_s16(dp + 24, vzip2q_s16(vfirst1, vsecond1));
-        first += 16;
-        second += 16;
-        openhtj2k_arm_prefetch(first);
-        openhtj2k_arm_prefetch(second);
+        line0 += 16;
+        line1 += 16;
         dp += 32;
       }
       for (; len > 0; --len) {
-        *dp++ = *first++;
-        *dp++ = *second++;
+        *dp++ = *line0++;
+        *dp++ = *line1++;
       }
     }
   }
@@ -474,8 +470,9 @@ static void idwt_2d_interleave_fixed(sprec_t *buf, sprec_t *LL, sprec_t *HL, spr
 #else
   for (uint8_t b = 0; b < 4; ++b) {
     for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+      sprec_t *line = sp[b] + v * stride2[b];
       for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-        buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(sp[b]++);
+        buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++);
       }
     }
   }
@@ -485,11 +482,12 @@ static void idwt_2d_interleave_fixed(sprec_t *buf, sprec_t *LL, sprec_t *HL, spr
 void idwt_2d_sr_fixed(sprec_t *nextLL, sprec_t *LL, sprec_t *HL, sprec_t *LH, sprec_t *HH, const int32_t u0,
                       const int32_t u1, const int32_t v0, const int32_t v1, const uint8_t transformation,
                       uint8_t normalizing_upshift) {
-  const int32_t buf_length = (u1 - u0) * (v1 - v0);
+  const int32_t stride     = round_up(u1 - u0, 32);
+  const int32_t buf_length = stride * (v1 - v0);
   sprec_t *src             = nextLL;
-  idwt_2d_interleave_fixed(src, LL, HL, LH, HH, u0, u1, v0, v1);
-  idwt_hor_sr_fixed(src, u0, u1, v0, v1, transformation);
-  idwt_ver_sr_fixed[transformation](src, u0, u1, v0, v1);
+  idwt_2d_interleave_fixed(src, LL, HL, LH, HH, u0, u1, v0, v1, stride);
+  idwt_hor_sr_fixed(src, u0, u1, v0, v1, transformation, stride);
+  idwt_ver_sr_fixed[transformation](src, u0, u1, v0, v1, stride);
 
   // scaling for 16bit width fixed-point representation
   if (transformation != 1 && normalizing_upshift) {
diff --git a/source/core/transform/idwt_neon.cpp b/source/core/transform/idwt_neon.cpp
index d1891c3..7ef7305 100644
--- a/source/core/transform/idwt_neon.cpp
+++ b/source/core/transform/idwt_neon.cpp
@@ -257,8 +257,7 @@ auto idwt_irrev97_fixed_neon_ver_step3 = [](const int32_t simdlen, int16_t *cons
     };
 
 void idwt_irrev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                                  const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                                  const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {3, 4};
   constexpr int32_t num_pse_i1[2] = {4, 3};
   const int32_t top               = num_pse_i0[v0 % 2];
@@ -336,8 +335,7 @@ void idwt_irrev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u
 
 // reversible IDWT
 void idwt_rev_ver_sr_fixed_neon(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                                const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                                const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {1, 2};
   constexpr int32_t num_pse_i1[2] = {2, 1};
   const int32_t top               = num_pse_i0[v0 % 2];

From 38fad87fa0410de01431b8eb46947df65aeb0fb0 Mon Sep 17 00:00:00 2001
From: Osamu Watanabe <owatanab@es.takushoku-u.ac.jp>
Date: Fri, 9 Aug 2024 17:02:45 +0900
Subject: [PATCH 4/6] Add stride access for DWT with AVX2

---
 source/apps/imgcmp/image_class.hpp            |   8 +-
 source/core/coding/coding_units.cpp           |  24 +--
 source/core/coding/coding_units.hpp           |  17 +-
 source/core/coding/ht_block_decoding.cpp      |  38 ++--
 source/core/coding/ht_block_decoding.hpp      |   2 +-
 source/core/coding/ht_block_encoding_avx2.hpp |  10 +-
 source/core/transform/color.cpp               |   2 +-
 source/core/transform/dwt.hpp                 |  18 +-
 source/core/transform/fdwt.cpp                | 124 +++++------
 source/core/transform/fdwt_avx2.cpp           |   6 +-
 source/core/transform/idwt.cpp                | 192 +++++++++---------
 source/core/transform/idwt_avx2.cpp           |   6 +-
 12 files changed, 228 insertions(+), 219 deletions(-)

diff --git a/source/apps/imgcmp/image_class.hpp b/source/apps/imgcmp/image_class.hpp
index f80150d..317a882 100644
--- a/source/apps/imgcmp/image_class.hpp
+++ b/source/apps/imgcmp/image_class.hpp
@@ -70,7 +70,7 @@ class image {
         bitDepth(0),
         isSigned(false),
         isBigendian(false),
-        data(nullptr) {};
+        data(nullptr){};
   // // destructor
   ~image() { delete[] data; }
 
@@ -110,9 +110,9 @@ class image {
 
   // parsing PNM/PGX header
   int read_pnmpgx(const char *name) {
-    constexpr char SP = ' ';
-    constexpr char LF = '\n';
-    constexpr char CR = 13;
+    constexpr char SP                  = ' ';
+    constexpr char LF                  = '\n';
+    [[maybe_unused]] constexpr char CR = 13;
 
     FILE *fp = fopen(name, "rb");
     if (fp == nullptr) {
diff --git a/source/core/coding/coding_units.cpp b/source/core/coding/coding_units.cpp
index c707483..eb8af08 100644
--- a/source/core/coding/coding_units.cpp
+++ b/source/core/coding/coding_units.cpp
@@ -267,10 +267,10 @@ void j2k_codeblock::create_compressed_buffer(buf_chain *tile_buf, int32_t buf_li
  *******************************************************************************/
 j2k_precinct_subband::j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uint8_t R_b,
                                            uint8_t transformation, float stepsize, sprec_t *ibuf,
-                                           const element_siz &bp0, const element_siz &bp1,
-                                           const element_siz &p0, const element_siz &p1,
-                                           const uint32_t band_stride, const uint16_t &num_layers,
-                                           const element_siz &codeblock_size, const uint8_t &Cmodes)
+                                           const element_siz &bp0, const element_siz &p0,
+                                           const element_siz &p1, const uint32_t band_stride,
+                                           const uint16_t &num_layers, const element_siz &codeblock_size,
+                                           const uint8_t &Cmodes)
     : j2k_region(p0, p1),
       orientation(orientation),
       inclusion_info(nullptr),
@@ -1028,8 +1028,8 @@ j2k_precinct::j2k_precinct(const uint8_t &r, const uint32_t &idx, const element_
                              ceil_int(pos1.y - yob[subband[i]->orientation], sr));
     this->pband[i] = MAKE_UNIQUE<j2k_precinct_subband>(
         subband[i]->orientation, subband[i]->M_b, subband[i]->R_b, subband[i]->transformation,
-        subband[i]->delta, subband[i]->i_samples, subband[i]->pos0, subband[i]->pos1, pbpos0, pbpos1,
-        subband[i]->stride, num_layers, codeblock_size, Cmodes);
+        subband[i]->delta, subband[i]->i_samples, subband[i]->pos0, pbpos0, pbpos1, subband[i]->stride,
+        num_layers, codeblock_size, Cmodes);
   }
 }
 
@@ -1402,7 +1402,7 @@ void j2k_tile_component::init(j2k_main_header *hdr, j2k_tilepart_header *tphdr,
   const uint32_t aligned_stride =
       round_up((ceil_int(pos1.x, 1U << tile->reduce_NL) - ceil_int(pos0.x, 1U << tile->reduce_NL)), 32U);
   const auto height             = static_cast<uint32_t>(ceil_int(pos1.y, 1U << tile->reduce_NL)
-                                                        - ceil_int(pos0.y, 1U << tile->reduce_NL));
+                                            - ceil_int(pos0.y, 1U << tile->reduce_NL));
   const uint32_t num_bufsamples = aligned_stride * height;
   samples = static_cast<int32_t *>(aligned_mem_alloc(sizeof(int32_t) * num_bufsamples, 32));
 
@@ -2459,7 +2459,7 @@ void j2k_tile::decode() {
 #endif
             }
           }  // end of codeblock loop
-        }  // end of subbnad loop
+        }    // end of subbnad loop
 #ifdef OPENHTJ2K_THREAD
         for (auto &result : results) {
           result.get();
@@ -2507,7 +2507,7 @@ void j2k_tile::decode() {
 
     // copy samples in resolution buffer to that in tile component buffer
     uint32_t height = tc1.y - tc0.y;
-    uint32_t width  = round_up(tc1.x - tc0.x, 32);
+    uint32_t width  = round_up(tc1.x - tc0.x, 32U);
     uint32_t stride = round_up(width, 32U);
     // size_t num_samples = static_cast<size_t>(tc1.x - tc0.x) * (tc1.y - tc0.y);
 #if defined(OPENHTJ2K_ENABLE_ARM_NEON)
@@ -3013,7 +3013,7 @@ uint8_t *j2k_tile::encode() {
 #if defined(OPENHTJ2K_TRY_AVX2) && defined(__AVX2__)
     for (uint32_t y = 0; y < height; ++y) {
       int32_t *sp             = src + y * stride;
-      sprec_t *dp             = cr->i_samples + y * (bottom_right.x - top_left.x);
+      sprec_t *dp             = cr->i_samples + y * stride;
       uint32_t num_tc_samples = bottom_right.x - top_left.x;
       for (; num_tc_samples >= 16; num_tc_samples -= 16) {
         __m256i v0 = _mm256_load_si256((__m256i *)sp);
@@ -3030,7 +3030,7 @@ uint8_t *j2k_tile::encode() {
 #elif defined(OPENHTJ2K_ENABLE_ARM_NEON)
     for (uint32_t y = 0; y < height; ++y) {
       int32_t *sp             = src + y * stride;
-      sprec_t *dp             = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32);
+      sprec_t *dp             = cr->i_samples + y * stride;
       uint32_t num_tc_samples = bottom_right.x - top_left.x;
       for (; num_tc_samples >= 8; num_tc_samples -= 8) {
         auto vsrc0 = vld1q_s32(sp);
@@ -3046,7 +3046,7 @@ uint8_t *j2k_tile::encode() {
 #else
       for (uint32_t y = 0; y < height; ++y) {
         int32_t *sp             = src + y * stride;
-        sprec_t *dp             = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32);
+        sprec_t *dp             = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32U);
         uint32_t num_tc_samples = bottom_right.x - top_left.x;
         for (; num_tc_samples > 0; --num_tc_samples) {
           *dp++ = static_cast<sprec_t>(*sp++);
diff --git a/source/core/coding/coding_units.hpp b/source/core/coding/coding_units.hpp
index d417798..a9d5963 100644
--- a/source/core/coding/coding_units.hpp
+++ b/source/core/coding/coding_units.hpp
@@ -58,7 +58,7 @@ class j2k_region {
   // set bottom-right coordinate (exclusive)
   void set_pos1(element_siz in) { pos1 = in; }
   j2k_region() = default;
-  j2k_region(element_siz p0, element_siz p1) : pos0(p0), pos1(p1), stride(round_up(pos1.x - pos0.x, 32)) {}
+  j2k_region(element_siz p0, element_siz p1) : pos0(p0), pos1(p1), stride(round_up(pos1.x - pos0.x, 32U)) {}
 };
 
 /********************************************************************************
@@ -179,10 +179,9 @@ class j2k_precinct_subband : public j2k_region {
   uint32_t num_codeblock_x;
   uint32_t num_codeblock_y;
   j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uint8_t R_b, uint8_t transformation,
-                       float stepsize, sprec_t *ibuf, const element_siz &bp0, const element_siz &bp1,
-                       const element_siz &p0, const element_siz &p1, const uint32_t stride,
-                       const uint16_t &num_layers, const element_siz &codeblock_size,
-                       const uint8_t &Cmodes);
+                       float stepsize, sprec_t *ibuf, const element_siz &bp0, const element_siz &p0,
+                       const element_siz &p1, const uint32_t stride, const uint16_t &num_layers,
+                       const element_siz &codeblock_size, const uint8_t &Cmodes);
   ~j2k_precinct_subband() {
     delete inclusion_info;
     delete ZBP_info;
@@ -257,7 +256,7 @@ class j2c_packet {
   uint32_t length;
 
   j2c_packet()
-      : layer(0), resolution(0), component(0), precinct(0), header(nullptr), body(nullptr), length(0) {};
+      : layer(0), resolution(0), component(0), precinct(0), header(nullptr), body(nullptr), length(0){};
   // constructor for decoding
   j2c_packet(const uint16_t l, const uint8_t r, const uint16_t c, const uint32_t p,
              buf_chain *const h = nullptr, buf_chain *const bo = nullptr)
@@ -316,7 +315,7 @@ class j2k_resolution : public j2k_region {
   void scale();
   void destroy() {
     aligned_mem_free(i_samples);
-    for (auto b = 0; b < num_bands; ++b) {
+    for (uint8_t b = 0; b < num_bands; ++b) {
       if (subbands != nullptr) {
         subbands[b]->destroy();
       }
@@ -424,7 +423,7 @@ class j2k_tile_component : public j2k_tile_base {
   void perform_dc_offset(uint8_t transformation, bool is_signed);
 
   void destroy() {
-    for (auto r = 0; r < this->NL; ++r) {
+    for (uint8_t r = 0; r < this->NL; ++r) {
       if (resolution != nullptr) {
         auto p = resolution[r].get();
         if (p != nullptr) resolution[r]->destroy();
@@ -495,7 +494,7 @@ class j2k_tile : public j2k_tile_base {
  public:
   j2k_tile();
   void destroy() {
-    for (auto c = 0; c < this->num_components; ++c) {
+    for (uint16_t c = 0; c < this->num_components; ++c) {
       tcomp[c].destroy();
     }
   }
diff --git a/source/core/coding/ht_block_decoding.cpp b/source/core/coding/ht_block_decoding.cpp
index becde89..2fddb92 100644
--- a/source/core/coding/ht_block_decoding.cpp
+++ b/source/core/coding/ht_block_decoding.cpp
@@ -265,7 +265,7 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
     for (uint32_t x = 0; x < block->size.x; sp += 2, ++vp) {
       uint32_t inf = sp[0];
       uint32_t U_q = sp[1];
-      if (U_q > ((30 - pLSB) + 2)) {
+      if (U_q > ((30U - pLSB) + 2U)) {
         printf("ERROR\n");
       }
 
@@ -279,14 +279,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
         MagSgn.advance(m_n);                                // consume m_n
 
         val = ms_val << 31;                      // get sign bit
-        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
         v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
         v_n |= 1;                                // add center of bin
         // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
         // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
         val |= (v_n + 2) << (pLSB - 1);
       }
-      dp[0] = val;
+      dp[0] = static_cast<int32_t>(val);
 
       v_n = 0;
       val = 0;
@@ -298,14 +298,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
         MagSgn.advance(m_n);                                // consume m_n
 
         val = ms_val << 31;                      // get sign bit
-        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
         v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
         v_n |= 1;                                // add center of bin
         // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
         // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
         val |= (v_n + 2) << (pLSB - 1);
       }
-      dp[block->blksampl_stride] = val;
+      dp[block->blksampl_stride] = static_cast<int32_t>(val);
       vp[0]                      = prev_v_n | v_n;
       prev_v_n                   = 0;
       ++dp;
@@ -323,14 +323,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
         MagSgn.advance(m_n);                                // consume m_n
 
         val = ms_val << 31;                      // get sign bit
-        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
         v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
         v_n |= 1;                                // add center of bin
         // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
         // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
         val |= (v_n + 2) << (pLSB - 1);
       }
-      dp[0] = val;
+      dp[0] = static_cast<int32_t>(val);
 
       v_n = 0;
       val = 0;
@@ -342,14 +342,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
         MagSgn.advance(m_n);                                // consume m_n
 
         val = ms_val << 31;                      // get sign bit
-        v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+        v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
         v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
         v_n |= 1;                                // add center of bin
         // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
         // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
         val |= (v_n + 2) << (pLSB - 1);
       }
-      dp[block->blksampl_stride] = val;
+      dp[block->blksampl_stride] = static_cast<int32_t>(val);
       prev_v_n                   = v_n;
       ++dp;
       ++x;
@@ -357,7 +357,7 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
     vp[0] = prev_v_n;
 
     for (uint32_t y = 2; y < block->size.y; y += 2) {
-      uint16_t *sp = scratch + (y >> 1) * sstr;
+      uint16_t *sp = scratch + (y >> 1) * static_cast<uint32_t>(sstr);
       uint32_t *vp = v_n_scratch;
       int32_t *dp  = block->sample_buf + y * block->blksampl_stride;
 
@@ -373,7 +373,7 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
         uint32_t kappa = gamma ? emax : 1;
 
         uint32_t U_q = u_q + kappa;
-        if (U_q > ((30 - pLSB) + 2)) {
+        if (U_q > ((30U - pLSB) + 2U)) {
           printf("ERROR\n");
         }
 
@@ -387,14 +387,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
           MagSgn.advance(m_n);                                // consume m_n
 
           val = ms_val << 31;                      // get sign bit
-          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
           v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
           v_n |= 1;                                // add center of bin
           // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
           // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
           val |= (v_n + 2) << (pLSB - 1);
         }
-        dp[0] = val;
+        dp[0] = static_cast<int32_t>(val);
 
         v_n = 0;
         val = 0;
@@ -406,14 +406,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
           MagSgn.advance(m_n);                                // consume m_n
 
           val = ms_val << 31;                      // get sign bit
-          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
           v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
           v_n |= 1;                                // add center of bin
           // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
           // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
           val |= (v_n + 2) << (pLSB - 1);
         }
-        dp[block->blksampl_stride] = val;
+        dp[block->blksampl_stride] = static_cast<int32_t>(val);
         vp[0]                      = prev_v_n | v_n;
         prev_v_n                   = 0;
         ++dp;
@@ -431,14 +431,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
           MagSgn.advance(m_n);                                // consume m_n
 
           val = ms_val << 31;                      // get sign bit
-          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
           v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
           v_n |= 1;                                // add center of bin
           // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
           // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
           val |= (v_n + 2) << (pLSB - 1);
         }
-        dp[0] = val;
+        dp[0] = static_cast<int32_t>(val);
 
         v_n = 0;
         val = 0;
@@ -450,14 +450,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
           MagSgn.advance(m_n);                                // consume m_n
 
           val = ms_val << 31;                      // get sign bit
-          v_n = ms_val & ((1 << m_n) - 1);         // keep only m_n bits
+          v_n = ms_val & ((1U << m_n) - 1U);       // keep only m_n bits
           v_n |= ((inf >> (8 + bit)) & 1) << m_n;  // add EMB e_1 as MSB
           v_n |= 1;                                // add center of bin
           // v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
           // add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
           val |= (v_n + 2) << (pLSB - 1);
         }
-        dp[block->blksampl_stride] = val;
+        dp[block->blksampl_stride] = static_cast<int32_t>(val);
         prev_v_n                   = v_n;
         ++dp;
         ++x;
diff --git a/source/core/coding/ht_block_decoding.hpp b/source/core/coding/ht_block_decoding.hpp
index 4752d81..cd2c185 100644
--- a/source/core/coding/ht_block_decoding.hpp
+++ b/source/core/coding/ht_block_decoding.hpp
@@ -866,7 +866,7 @@ class fwd_buf {
    */
   FORCE_INLINE void advance(uint32_t num_bits) {
     // if (!num_bits) return;
-    if (!(num_bits >= 0 && num_bits <= this->bits && num_bits < 128)) {
+    if (!(num_bits <= this->bits && num_bits < 128)) {
       printf("Value of numbits = %d is out of range.\n", num_bits);
       throw std::exception();
     }
diff --git a/source/core/coding/ht_block_encoding_avx2.hpp b/source/core/coding/ht_block_encoding_avx2.hpp
index 8f4a44d..6c971fb 100644
--- a/source/core/coding/ht_block_encoding_avx2.hpp
+++ b/source/core/coding/ht_block_encoding_avx2.hpp
@@ -85,22 +85,22 @@ class state_MS_enc {
     uint32_t t = 0;
 
     // _bzhi_u32(UINT32_MAX, len) = ((1U << len) - 1U)
-    tmp = val & ((1 << (8 - stuff)) - 1);
+    tmp = val & ((1U << (8U - stuff)) - 1U);
     t |= tmp;
     bits_local += 8 - stuff;
     stuff = (tmp == 0xFF);
 
-    tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1);
+    tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U);
     t |= tmp << 8;
     bits_local += 8 - stuff;
     stuff = (tmp == 0xFF);
 
-    tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1);
+    tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U);
     t |= tmp << 16;
     bits_local += 8 - stuff;
     stuff = (tmp == 0xFF);
 
-    tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1);
+    tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U);
     t |= tmp << 24;
     bits_local += 8 - stuff;
     last = tmp & 0xFF;
@@ -168,7 +168,7 @@ class state_MS_enc {
     }
 #else
     for (int i = 0; i < 4; ++i) {
-      Creg |= static_cast<__uint128_t>(_mm_cvtsi128_si32(v)) << ctreg;
+      Creg |= static_cast<uint64_t>(static_cast<__uint128_t>(_mm_cvtsi128_si32(v)) << ctreg);
       ctreg += static_cast<uint32_t>(_mm_cvtsi128_si32(m));
       v = _mm_srli_si128(v, 4);
       m = _mm_srli_si128(m, 4);
diff --git a/source/core/transform/color.cpp b/source/core/transform/color.cpp
index 48df1ab..f63aba9 100644
--- a/source/core/transform/color.cpp
+++ b/source/core/transform/color.cpp
@@ -26,7 +26,7 @@
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#if !defined(OPENHTJ2K_TRY_AVX2) && !defined(__AVX2__) && !defined(OPENHTJ2K_ENABLE_ARM_NEON)
+#if not defined(OPENHTJ2K_TRY_AVX2) && not defined(OPENHTJ2K_ENABLE_ARM_NEON)
   #include "color.hpp"
 
 void cvt_rgb_to_ycbcr_rev(int32_t *sp0, int32_t *sp1, int32_t *sp2, uint32_t width, uint32_t height) {
diff --git a/source/core/transform/dwt.hpp b/source/core/transform/dwt.hpp
index f65d398..e34c546 100644
--- a/source/core/transform/dwt.hpp
+++ b/source/core/transform/dwt.hpp
@@ -100,14 +100,14 @@ void fdwt_1d_filtr_irrev97_fixed_avx2(sprec_t *X, const int32_t left, const int3
                                       const int32_t u_i1);
 void fdwt_1d_filtr_rev53_fixed_avx2(sprec_t *X, const int32_t left, const int32_t u_i0, const int32_t u_i1);
 void fdwt_irrev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                                  const int32_t v1);
+                                  const int32_t v1, const int32_t stride);
 void fdwt_rev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                                const int32_t v1);
+                                const int32_t v1, const int32_t stride);
 #else
 void fdwt_1d_filtr_irrev97_fixed(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
 void fdwt_1d_filtr_rev53_fixed(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
-void fdwt_irrev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1);
-void fdwt_rev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1);
+void fdwt_irrev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, int32_t stride);
+void fdwt_rev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, int32_t stride);
 #endif
 
 void fdwt_2d_sr_fixed(sprec_t *previousLL, sprec_t *LL, sprec_t *HL, sprec_t *LH, sprec_t *HH, int32_t u0,
@@ -124,13 +124,15 @@ void idwt_rev_ver_sr_fixed_neon(sprec_t *in, int32_t u0, int32_t u1, int32_t v0,
 #elif defined(OPENHTJ2K_ENABLE_AVX2)
 void idwt_1d_filtr_rev53_fixed_avx2(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
 void idwt_1d_filtr_irrev97_fixed_avx2(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
-void idwt_irrev_ver_sr_fixed_avx2(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1);
-void idwt_rev_ver_sr_fixed_avx2(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1);
+void idwt_irrev_ver_sr_fixed_avx2(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1,
+                                  int32_t stride);
+void idwt_rev_ver_sr_fixed_avx2(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1,
+                                int32_t stride);
 #else
 void idwt_1d_filtr_rev53_fixed(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
 void idwt_1d_filtr_irrev97_fixed(sprec_t *X, int32_t left, int32_t u_i0, int32_t u_i1);
-void idwt_irrev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1);
-void idwt_rev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1);
+void idwt_irrev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, int32_t stride);
+void idwt_rev_ver_sr_fixed(sprec_t *in, int32_t u0, int32_t u1, int32_t v0, int32_t v1, int32_t stride);
 #endif
 void idwt_2d_sr_fixed(sprec_t *nextLL, sprec_t *LL, sprec_t *HL, sprec_t *LH, sprec_t *HH, int32_t u0,
                       int32_t u1, int32_t v0, int32_t v1, uint8_t transformation,
diff --git a/source/core/transform/fdwt.cpp b/source/core/transform/fdwt.cpp
index c43f2c6..0459573 100644
--- a/source/core/transform/fdwt.cpp
+++ b/source/core/transform/fdwt.cpp
@@ -127,7 +127,7 @@ static void fdwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, c
     // need to perform symmetric extension
     const int32_t len = u1 - u0 + left + right;
     auto *Xext        = static_cast<sprec_t *>(aligned_mem_alloc(
-        sizeof(sprec_t) * static_cast<size_t>(round_up(len + SIMD_PADDING, SIMD_PADDING)), 32));
+               sizeof(sprec_t) * static_cast<size_t>(round_up(len + SIMD_PADDING, SIMD_PADDING)), 32));
     // #pragma omp parallel for
     for (int32_t row = 0; row < v1 - v0; ++row) {
       fdwt_1d_sr_fixed(Xext, in, left, right, u0, u1, transformation);
@@ -341,11 +341,11 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
     }
   } else {
     sprec_t *first, *second;
-    first  = LH;
-    second = HH;
+    first  = dp[2];
+    second = dp[3];
     if (uoffset[2] > uoffset[3]) {
-      first  = HH;
-      second = LH;
+      first  = dp[3];
+      second = dp[2];
     }
     for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) {
       sprec_t *sp    = buf + (2 * v + voffset[2]) * stride;
@@ -368,28 +368,31 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
   }
 #elif defined(OPENHTJ2K_TRY_AVX2) && defined(__AVX2__)
   if ((ustop[0] - ustart[0]) != (ustop[1] - ustart[1])) {
-    for (uint8_t b = 0; b < 2; ++b) {
-      for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
-        for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-          *(dp[b]++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
+     for (uint8_t b = 0; b < 2; ++b) {
+       for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+         sprec_t *line = dp[b] + v * stride2[b];
+         for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
+           *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
         }
       }
     }
   } else {
-    sprec_t *first, *second;
-    first  = LL;
-    second = HL;
-    if (uoffset[0] > uoffset[1]) {
-      first  = HL;
-      second = LL;
-    }
-    const __m256i vshmask = _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14,
-                                            11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
-    for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) {
-      sprec_t *sp = buf + (2 * v + voffset[0]) * stride;
-      size_t len  = static_cast<size_t>(ustop[0] - ustart[0]);
-      for (; len >= 8; len -= 8) {
-        // SSE version
+     sprec_t *first, *second;
+     first  = dp[0];
+     second = dp[1];
+     if (uoffset[0] > uoffset[1]) {
+       first  = dp[1];
+       second = dp[0];
+    }
+     const __m256i vshmask = _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14,
+                                             11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
+     for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) {
+       sprec_t *sp    = buf + (2 * v + voffset[0]) * stride;
+       size_t len     = static_cast<size_t>(ustop[0] - ustart[0]);
+       sprec_t *line0 = first + v * stride2[0];
+       sprec_t *line1 = second + v * stride2[0];
+       for (; len >= 8; len -= 8) {
+         // SSE version
         // auto vline0 = _mm_loadu_si128((__m128i *)sp);
         // auto vline1 = _mm_loadu_si128((__m128i *)(sp + 8));
         // vline0      = _mm_shufflelo_epi16(vline0, _MM_SHUFFLE(3, 1, 2, 0));
@@ -398,47 +401,50 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
         // vline1      = _mm_shufflehi_epi16(vline1, _MM_SHUFFLE(3, 1, 2, 0));
         // vline0      = _mm_shuffle_epi32(vline0, _MM_SHUFFLE(3, 1, 2, 0));  // A1 A2 A3 A4 B1 B2 B3 B4
         // vline1      = _mm_shuffle_epi32(vline1, _MM_SHUFFLE(3, 1, 2, 0));  // A5 A6 A7 A8 B5 B6 B7 B8
-        // _mm_storeu_si128((__m128i *)first, _mm_unpacklo_epi64(vline0, vline1));
-        // _mm_storeu_si128((__m128i *)second, _mm_unpackhi_epi64(vline0, vline1));
+        // _mm_storeu_si128((__m128i *)line0, _mm_unpacklo_epi64(vline0, vline1));
+        // _mm_storeu_si128((__m128i *)line1, _mm_unpackhi_epi64(vline0, vline1));
 
         __m256i vline = _mm256_loadu_si256((__m256i *)sp);
         vline         = _mm256_shuffle_epi8(vline, vshmask);
         vline         = _mm256_permute4x64_epi64(vline, 0xD8);
-        _mm256_storeu2_m128i((__m128i *)second, (__m128i *)first, vline);
-        first += 8;
-        second += 8;
+        _mm256_storeu2_m128i((__m128i *)line1, (__m128i *)line0, vline);
+        line0 += 8;
+        line1 += 8;
         sp += 16;
       }
       for (; len > 0; --len) {
-        *first++  = *sp++;
-        *second++ = *sp++;
+         *line0++ = *sp++;
+         *line1++ = *sp++;
       }
     }
   }
 
   if ((ustop[2] - ustart[2]) != (ustop[3] - ustart[3])) {
-    for (uint8_t b = 2; b < 4; ++b) {
-      for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
-        for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-          *(dp[b]++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
+     for (uint8_t b = 2; b < 4; ++b) {
+       for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+         sprec_t *line = dp[b] + v * stride2[b];
+         for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
+           *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
         }
       }
     }
   } else {
-    sprec_t *first, *second;
-    first  = LH;
-    second = HH;
-    if (uoffset[2] > uoffset[3]) {
-      first  = HH;
-      second = LH;
-    }
-    const __m256i vshmask = _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14,
-                                            11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
-    for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) {
-      sprec_t *sp = buf + (2 * v + voffset[2]) * stride;
-      size_t len  = static_cast<size_t>(ustop[2] - ustart[2]);
-      for (; len >= 8; len -= 8) {
-        // SSE version
+     sprec_t *first, *second;
+     first  = dp[2];
+     second = dp[3];
+     if (uoffset[2] > uoffset[3]) {
+       first  = dp[3];
+       second = dp[2];
+    }
+     const __m256i vshmask = _mm256_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, 15, 14,
+                                             11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
+     for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) {
+       sprec_t *sp    = buf + (2 * v + voffset[2]) * stride;
+       size_t len     = static_cast<size_t>(ustop[2] - ustart[2]);
+       sprec_t *line0 = first + v * stride2[2];
+       sprec_t *line1 = second + v * stride2[2];
+       for (; len >= 8; len -= 8) {
+         // SSE version
         // auto vline0 = _mm_loadu_si128((__m128i *)sp);
         // auto vline1 = _mm_loadu_si128((__m128i *)(sp + 8));
         // vline0      = _mm_shufflelo_epi16(vline0, _MM_SHUFFLE(3, 1, 2, 0));
@@ -447,29 +453,29 @@ static void fdwt_2d_deinterleave_fixed(sprec_t *buf, sprec_t *const LL, sprec_t
         // vline1      = _mm_shufflehi_epi16(vline1, _MM_SHUFFLE(3, 1, 2, 0));
         // vline0      = _mm_shuffle_epi32(vline0, _MM_SHUFFLE(3, 1, 2, 0));  // A1 A2 A3 A4 B1 B2 B3 B4
         // vline1      = _mm_shuffle_epi32(vline1, _MM_SHUFFLE(3, 1, 2, 0));  // A5 A6 A7 A8 B5 B6 B7 B8
-        // _mm_storeu_si128((__m128i *)first, _mm_unpacklo_epi64(vline0, vline1));
-        // _mm_storeu_si128((__m128i *)second, _mm_unpackhi_epi64(vline0, vline1));
+        // _mm_storeu_si128((__m128i *)line0, _mm_unpacklo_epi64(vline0, vline1));
+        // _mm_storeu_si128((__m128i *)line1, _mm_unpackhi_epi64(vline0, vline1));
 
         __m256i vline = _mm256_loadu_si256((__m256i *)sp);
         vline         = _mm256_shuffle_epi8(vline, vshmask);
         vline         = _mm256_permute4x64_epi64(vline, 0xD8);
-        _mm256_storeu2_m128i((__m128i *)second, (__m128i *)first, vline);
-        first += 8;
-        second += 8;
+        _mm256_storeu2_m128i((__m128i *)line1, (__m128i *)line0, vline);
+        line0 += 8;
+        line1 += 8;
         sp += 16;
       }
       for (; len > 0; --len) {
-        *first++  = *sp++;
-        *second++ = *sp++;
+         *line0++ = *sp++;
+         *line1++ = *sp++;
       }
     }
   }
 #else
   for (uint8_t b = 0; b < 4; ++b) {
-    for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
-      sprec_t *line = dp[b] + v * stride2[b];
-      for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-        *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
+     for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+       sprec_t *line = dp[b] + v * stride2[b];
+       for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
+         *(line++) = buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride];
       }
     }
   }
diff --git a/source/core/transform/fdwt_avx2.cpp b/source/core/transform/fdwt_avx2.cpp
index b1e449a..cc12f04 100644
--- a/source/core/transform/fdwt_avx2.cpp
+++ b/source/core/transform/fdwt_avx2.cpp
@@ -338,8 +338,7 @@ auto fdwt_irrev97_fixed_avx2_ver_step3 = [](const int32_t simdlen, int16_t *cons
 };
 
 void fdwt_irrev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                                  const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                                  const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {4, 3};
   constexpr int32_t num_pse_i1[2] = {3, 4};
   const int32_t top               = num_pse_i0[v0 % 2];
@@ -421,8 +420,7 @@ void fdwt_irrev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u
 
 // reversible FDWT
 void fdwt_rev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                                const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                                const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {2, 1};
   constexpr int32_t num_pse_i1[2] = {1, 2};
   const int32_t top               = num_pse_i0[v0 % 2];
diff --git a/source/core/transform/idwt.cpp b/source/core/transform/idwt.cpp
index 35ab00d..a2161d9 100644
--- a/source/core/transform/idwt.cpp
+++ b/source/core/transform/idwt.cpp
@@ -120,7 +120,7 @@ static void idwt_hor_sr_fixed(sprec_t *in, const int32_t u0, const int32_t u1, c
     // need to perform symmetric extension
     const int32_t len = u1 - u0 + left + right;
     auto *Yext        = static_cast<sprec_t *>(aligned_mem_alloc(
-        sizeof(sprec_t) * static_cast<size_t>(round_up(len + SIMD_PADDING, SIMD_PADDING)), 32));
+               sizeof(sprec_t) * static_cast<size_t>(round_up(len + SIMD_PADDING, SIMD_PADDING)), 32));
     for (int32_t row = 0; row < v1 - v0; ++row) {
       idwt_1d_sr_fixed(Yext, in, left, right, u0, u1, transformation);
       in += stride;
@@ -363,116 +363,122 @@ static void idwt_2d_interleave_fixed(sprec_t *buf, sprec_t *LL, sprec_t *HL, spr
   }
 #elif defined(OPENHTJ2K_TRY_AVX2) && defined(__AVX2__)
   if ((ustop[0] - ustart[0]) != (ustop[1] - ustart[1])) {
-    for (uint8_t b = 0; b < 2; ++b) {
-      for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
-        for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-          buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(sp[b]++);
+     for (uint8_t b = 0; b < 2; ++b) {
+       for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+         sprec_t *line = sp[b] + v * stride2[b];
+         for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
+           buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++);
         }
       }
     }
   } else {
-    sprec_t *first, *second;
-    first  = LL;
-    second = HL;
-    if (uoffset[0] > uoffset[1]) {
-      first  = HL;
-      second = LL;
-    }
-    for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) {
-      sprec_t *dp = buf + (2 * v + voffset[0]) * stride;
-      size_t len  = static_cast<size_t>(ustop[0] - ustart[0]);
-      // SSE version
-      //  for (; len >= 8; len -= 8) {
-      //    auto vfirst  = _mm_loadu_si128((__m128i *)first);
-      //    auto vsecond = _mm_loadu_si128((__m128i *)second);
-      //    auto vtmp0   = _mm_unpacklo_epi16(vfirst, vsecond);
-      //    auto vtmp1   = _mm_unpackhi_epi16(vfirst, vsecond);
-      //    _mm_storeu_si128((__m128i *)dp, vtmp0);
-      //    _mm_storeu_si128((__m128i *)(dp + 8), vtmp1);
-      //    first += 8;
-      //    second += 8;
-      //    dp += 16;
-      // }
+     sprec_t *first, *second;
+     first  = sp[0];
+     second = sp[1];
+     if (uoffset[0] > uoffset[1]) {
+       first  = sp[1];
+       second = sp[0];
+    }
+     for (int32_t v = 0, vb = vstart[0]; vb < vstop[0]; ++vb, ++v) {
+       sprec_t *dp    = buf + (2 * v + voffset[0]) * stride;
+       size_t len     = static_cast<size_t>(ustop[0] - ustart[0]);
+       sprec_t *line0 = first + v * stride2[0];
+       sprec_t *line1 = second + v * stride2[0];
+       // SSE version
+       //  for (; len >= 8; len -= 8) {
+       //    auto vfirst  = _mm_loadu_si128((__m128i *)line0);
+       //    auto vsecond = _mm_loadu_si128((__m128i *)line1);
+       //    auto vtmp0   = _mm_unpacklo_epi16(vfirst, vsecond);
+       //    auto vtmp1   = _mm_unpackhi_epi16(vfirst, vsecond);
+       //    _mm_storeu_si128((__m128i *)dp, vtmp0);
+       //    _mm_storeu_si128((__m128i *)(dp + 8), vtmp1);
+       //    line0 += 8;
+       //    line1 += 8;
+       //    dp += 16;
+       // }
 
-      // AVX2 version
-      __m256i vfirst, vsecond;
-      for (; len >= 16; len -= 16) {
-        vfirst     = _mm256_loadu_si256((__m256i *)first);
-        vsecond    = _mm256_loadu_si256((__m256i *)second);
-        auto vtmp0 = _mm256_unpacklo_epi16(vfirst, vsecond);
-        auto vtmp1 = _mm256_unpackhi_epi16(vfirst, vsecond);
+       // AVX2 version
+       __m256i vfirst, vsecond;
+       for (; len >= 16; len -= 16) {
+         vfirst     = _mm256_loadu_si256((__m256i *)line0);
+         vsecond    = _mm256_loadu_si256((__m256i *)line1);
+         auto vtmp0 = _mm256_unpacklo_epi16(vfirst, vsecond);
+         auto vtmp1 = _mm256_unpackhi_epi16(vfirst, vsecond);
 
-        _mm256_storeu_si256((__m256i *)dp, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x20));
-        _mm256_storeu_si256((__m256i *)dp + 1, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x31));
-        first += 16;
-        second += 16;
-        dp += 32;
+         _mm256_storeu_si256((__m256i *)dp, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x20));
+         _mm256_storeu_si256((__m256i *)dp + 1, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x31));
+         line0 += 16;
+         line1 += 16;
+         dp += 32;
       }
-      for (; len > 0; --len) {
-        *dp++ = *first++;
-        *dp++ = *second++;
+       for (; len > 0; --len) {
+         *dp++ = *line0++;
+         *dp++ = *line1++;
       }
     }
   }
 
   if ((ustop[2] - ustart[2]) != (ustop[3] - ustart[3])) {
-    for (uint8_t b = 2; b < 4; ++b) {
-      for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
-        for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-          buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(sp[b]++);
+     for (uint8_t b = 2; b < 4; ++b) {
+       for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+         sprec_t *line = sp[b] + v * stride2[b];
+         for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
+           buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++);
         }
       }
     }
   } else {
-    sprec_t *first, *second;
-    first  = LH;
-    second = HH;
-    if (uoffset[2] > uoffset[3]) {
-      first  = HH;
-      second = LH;
-    }
-    for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) {
-      sprec_t *dp = buf + (2 * v + voffset[2]) * stride;
-      size_t len  = static_cast<size_t>(ustop[2] - ustart[2]);
-      // SSE version
-      //  for (; len >= 8; len -= 8) {
-      //    auto vfirst  = _mm_loadu_si128((__m128i *)first);
-      //    auto vsecond = _mm_loadu_si128((__m128i *)second);
-      //    auto vtmp0   = _mm_unpacklo_epi16(vfirst, vsecond);
-      //    auto vtmp1   = _mm_unpackhi_epi16(vfirst, vsecond);
-      //    _mm_storeu_si128((__m128i *)dp, vtmp0);
-      //    _mm_storeu_si128((__m128i *)(dp + 8), vtmp1);
-      //    first += 8;
-      //    second += 8;
-      //    dp += 16;
-      // }
+     sprec_t *first, *second;
+     first  = sp[2];
+     second = sp[3];
+     if (uoffset[2] > uoffset[3]) {
+       first  = sp[3];
+       second = sp[2];
+    }
+     for (int32_t v = 0, vb = vstart[2]; vb < vstop[2]; ++vb, ++v) {
+       sprec_t *dp    = buf + (2 * v + voffset[2]) * stride;
+       size_t len     = static_cast<size_t>(ustop[2] - ustart[2]);
+       sprec_t *line0 = first + v * stride2[2];
+       sprec_t *line1 = second + v * stride2[2];
+       // SSE version
+       //  for (; len >= 8; len -= 8) {
+       //    auto vfirst  = _mm_loadu_si128((__m128i *)line0);
+       //    auto vsecond = _mm_loadu_si128((__m128i *)line1);
+       //    auto vtmp0   = _mm_unpacklo_epi16(vfirst, vsecond);
+       //    auto vtmp1   = _mm_unpackhi_epi16(vfirst, vsecond);
+       //    _mm_storeu_si128((__m128i *)dp, vtmp0);
+       //    _mm_storeu_si128((__m128i *)(dp + 8), vtmp1);
+       //    line0 += 8;
+       //    line1 += 8;
+       //    dp += 16;
+       // }
 
-      // AVX2 version
-      __m256i vfirst, vsecond;
-      for (; len >= 16; len -= 16) {
-        vfirst     = _mm256_loadu_si256((__m256i *)first);
-        vsecond    = _mm256_loadu_si256((__m256i *)second);
-        auto vtmp0 = _mm256_unpacklo_epi16(vfirst, vsecond);
-        auto vtmp1 = _mm256_unpackhi_epi16(vfirst, vsecond);
+       // AVX2 version
+       __m256i vfirst, vsecond;
+       for (; len >= 16; len -= 16) {
+         vfirst     = _mm256_loadu_si256((__m256i *)line0);
+         vsecond    = _mm256_loadu_si256((__m256i *)line1);
+         auto vtmp0 = _mm256_unpacklo_epi16(vfirst, vsecond);
+         auto vtmp1 = _mm256_unpackhi_epi16(vfirst, vsecond);
 
-        _mm256_storeu_si256((__m256i *)dp, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x20));
-        _mm256_storeu_si256((__m256i *)dp + 1, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x31));
-        first += 16;
-        second += 16;
-        dp += 32;
+         _mm256_storeu_si256((__m256i *)dp, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x20));
+         _mm256_storeu_si256((__m256i *)dp + 1, _mm256_permute2x128_si256(vtmp0, vtmp1, 0x31));
+         line0 += 16;
+         line1 += 16;
+         dp += 32;
       }
-      for (; len > 0; --len) {
-        *dp++ = *first++;
-        *dp++ = *second++;
+       for (; len > 0; --len) {
+         *dp++ = *line0++;
+         *dp++ = *line1++;
       }
     }
   }
 #else
   for (uint8_t b = 0; b < 4; ++b) {
-    for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
-      sprec_t *line = sp[b] + v * stride2[b];
-      for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
-        buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++);
+     for (int32_t v = 0, vb = vstart[b]; vb < vstop[b]; ++vb, ++v) {
+       sprec_t *line = sp[b] + v * stride2[b];
+       for (int32_t u = 0, ub = ustart[b]; ub < ustop[b]; ++ub, ++u) {
+         buf[2 * u + uoffset[b] + (2 * v + voffset[b]) * stride] = *(line++);
       }
     }
   }
@@ -511,19 +517,19 @@ void idwt_2d_sr_fixed(sprec_t *nextLL, sprec_t *LL, sprec_t *HL, sprec_t *LH, sp
     }
 #elif defined(OPENHTJ2K_TRY_AVX2) && defined(__AVX2__)
     for (; len >= 16; len -= 16) {
-      __m256i tmp0 = _mm256_load_si256((__m256i *)src);
-      __m256i tmp1 = _mm256_slli_epi16(tmp0, static_cast<int32_t>(normalizing_upshift));
-      _mm256_store_si256((__m256i *)src, tmp1);
-      src += 16;
+       __m256i tmp0 = _mm256_load_si256((__m256i *)src);
+       __m256i tmp1 = _mm256_slli_epi16(tmp0, static_cast<int32_t>(normalizing_upshift));
+       _mm256_store_si256((__m256i *)src, tmp1);
+       src += 16;
     }
     for (; len > 0; --len) {
-      // cast to unsigned to avoid undefined behavior
+       // cast to unsigned to avoid undefined behavior
       *src = static_cast<sprec_t>(static_cast<usprec_t>(*src) << normalizing_upshift);
       src++;
     }
 #else
     for (; len > 0; --len) {
-      // cast to unsigned to avoid undefined behavior
+       // cast to unsigned to avoid undefined behavior
       *src = static_cast<sprec_t>(static_cast<usprec_t>(*src) << normalizing_upshift);
       src++;
     }
diff --git a/source/core/transform/idwt_avx2.cpp b/source/core/transform/idwt_avx2.cpp
index c4a46cd..3ea6a91 100644
--- a/source/core/transform/idwt_avx2.cpp
+++ b/source/core/transform/idwt_avx2.cpp
@@ -258,8 +258,7 @@ auto idwt_irrev97_fixed_avx2_ver_step = [](const int32_t simdlen, int16_t *const
 };
 
 void idwt_irrev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                                  const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                                  const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {3, 4};
   constexpr int32_t num_pse_i1[2] = {4, 3};
   const int32_t top               = num_pse_i0[v0 % 2];
@@ -337,8 +336,7 @@ void idwt_irrev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u
 
 // reversible IDWT
 void idwt_rev_ver_sr_fixed_avx2(sprec_t *in, const int32_t u0, const int32_t u1, const int32_t v0,
-                                const int32_t v1) {
-  const int32_t stride            = u1 - u0;
+                                const int32_t v1, const int32_t stride) {
   constexpr int32_t num_pse_i0[2] = {1, 2};
   constexpr int32_t num_pse_i1[2] = {2, 1};
   const int32_t top               = num_pse_i0[v0 % 2];

From b9f2848a4dc1eee5b71b4c3fd66a0cf82350a153 Mon Sep 17 00:00:00 2001
From: OSAMU WATANABE <owatanab@es.takushoku-u.ac.jp>
Date: Fri, 15 Nov 2024 11:43:39 +0900
Subject: [PATCH 5/6] - Fix incorrect packet parsing for RPCL, PCRL, CPRL -
 Change cmake configuration for MIngW environments

---
 CHANGELOG                                 |  6 ++++++
 CMakeLists.txt                            |  4 +---
 source/core/coding/coding_units.cpp       | 14 +++++---------
 source/core/common/open_htj2k_version.hpp |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index df24fd7..0306b68 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,9 @@
+# [0.2.8] - 2024-11-12
+
+* Fix incorrect packet parsing for RPCL, PCRL, CPRL
+* Introduce stride access into DWT
+* Change cmake configuration for MinGW environments
+
 # [0.2.7] - 2024-06-13
 
 * Refactor non-SIMD HT cleanup decoding
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b3de7a1..b73d1c7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -153,9 +153,7 @@ endif()
 
 if(NOT EMSCRIPTEN)
   if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^[xX]86_64$|^[aA][mM][dD]64$") # x86_64
-    if(NOT MINGW)
-      option(ENABLE_AVX2 "Enable the use of Intel AVX2 intrinsics" ON)
-    endif()
+    option(ENABLE_AVX2 "Enable the use of Intel AVX2 intrinsics" ON)
     if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
       set(CMAKE_CXX_FLAGS
           "${CMAKE_CXX_FLAGS} /arch:AVX2 /EHsc /D \"_CRT_SECURE_NO_WARNINGS\"")
diff --git a/source/core/coding/coding_units.cpp b/source/core/coding/coding_units.cpp
index eb8af08..555f495 100644
--- a/source/core/coding/coding_units.cpp
+++ b/source/core/coding/coding_units.cpp
@@ -551,9 +551,7 @@ void j2k_precinct_subband::parse_packet_header(buf_chain *packet_header, uint16_
             if (!(block->Cmodes & HT_MIXED)) {
               // Must be the first HT Cleanup pass
               if (segment_bytes < 2) {
-                printf(
-                    "ERROR: Length information for a HT-codeblock is "
-                    "invalid\n");
+                printf("ERROR: Length information for a HT-codeblock is invalid\n");
                 throw std::exception();
               }
               next_segment_passes = 2;
@@ -595,9 +593,7 @@ void j2k_precinct_subband::parse_packet_header(buf_chain *packet_header, uint16_
                 if (block->Cmodes & HT_MIXED) {
                   block->Cmodes &= static_cast<uint16_t>(~(HT_PHLD | HT));
                 } else {
-                  printf(
-                      "ERROR: Length information for a HT-codeblock is "
-                      "invalid\n");
+                  printf("ERROR: Length information for a HT-codeblock is invalid\n");
                   throw std::exception();
                 }
               }
@@ -1402,7 +1398,7 @@ void j2k_tile_component::init(j2k_main_header *hdr, j2k_tilepart_header *tphdr,
   const uint32_t aligned_stride =
       round_up((ceil_int(pos1.x, 1U << tile->reduce_NL) - ceil_int(pos0.x, 1U << tile->reduce_NL)), 32U);
   const auto height             = static_cast<uint32_t>(ceil_int(pos1.y, 1U << tile->reduce_NL)
-                                            - ceil_int(pos0.y, 1U << tile->reduce_NL));
+                                                        - ceil_int(pos0.y, 1U << tile->reduce_NL));
   const uint32_t num_bufsamples = aligned_stride * height;
   samples = static_cast<int32_t *>(aligned_mem_alloc(sizeof(int32_t) * num_bufsamples, 32));
 
@@ -2459,7 +2455,7 @@ void j2k_tile::decode() {
 #endif
             }
           }  // end of codeblock loop
-        }    // end of subbnad loop
+        }  // end of subbnad loop
 #ifdef OPENHTJ2K_THREAD
         for (auto &result : results) {
           result.get();
@@ -2667,7 +2663,7 @@ void j2k_tile::find_gcd_of_precinct_size(element_siz &out) {
     for (uint8_t r = 0; r <= this->tcomp[c].get_dwt_levels(); r++) {
       PP  = this->tcomp[c].get_precinct_size(r);
       PPx = (PPx > PP.x) ? static_cast<uint8_t>(PP.x) : PPx;
-      PPy = (PPy > PP.y) ? static_cast<uint8_t>(PP.y) : PPx;
+      PPy = (PPy > PP.y) ? static_cast<uint8_t>(PP.y) : PPy;
     }
   }
   out.x = PPx;
diff --git a/source/core/common/open_htj2k_version.hpp b/source/core/common/open_htj2k_version.hpp
index 028a808..ead0ea1 100644
--- a/source/core/common/open_htj2k_version.hpp
+++ b/source/core/common/open_htj2k_version.hpp
@@ -28,4 +28,4 @@
 
 #define OPENHTJ2K_VERSION_MAJOR 0
 #define OPENHTJ2K_VERSION_MINOR 2
-#define OPENHTJ2K_VERSION_PATCH 7
+#define OPENHTJ2K_VERSION_PATCH 8

From 8efd958cbdc5dc085f9edcd771be0af233d37d40 Mon Sep 17 00:00:00 2001
From: OSAMU WATANABE <owatanab@es.takushoku-u.ac.jp>
Date: Fri, 15 Nov 2024 12:58:34 +0900
Subject: [PATCH 6/6] Add .gitattributes for unexpected CRLF conversion

---
 .gitattributes | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 .gitattributes

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..c20a730
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,17 @@
+# Set the default behavior, in case people don't have core.autocrlf set.
+* text=auto
+
+# Explicitly declare text files you want to always be normalized and converted
+# to native line endings on checkout.
+*.c text
+*.h text
+*.cpp text
+*.hpp text
+
+# Declare files that will always have CRLF line endings on checkout.
+*.sln text eol=crlf
+
+# Denote all files that are truly binary and should not be modified.
+*.pgx binary
+*.j2k binary
+*.ppm binary