diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index df021b0..94ca3eb 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -1,11 +1,16 @@ # taken from https://github.com/onqtam/doctest/blob/master/.github/workflows/main.yml name: C/C++ CI -on: push +on: + push: + pull_request: + types: [opened, reopened] + jobs: build: strategy: + fail-fast: false matrix: include: [ { system: MacOS, runner: macos-latest }, @@ -25,6 +30,7 @@ jobs: build_windows: strategy: + fail-fast: false matrix: include: [ { system: Windows, runner: windows-latest }, @@ -42,9 +48,11 @@ jobs: test: strategy: + fail-fast: false matrix: include: [ - { system: MacOS, runner: macos-latest }, + { system: MacOS-13, runner: macos-13 }, + { system: MacOS-latest, runner: macos-latest }, { system: Ubuntu-latest, runner: ubuntu-latest }, ] name: ${{ matrix.system }} Test @@ -63,6 +71,7 @@ jobs: test_windows: strategy: + fail-fast: false matrix: include: [ { system: Windows, runner: windows-latest }, diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index c2d527a..7d031b5 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -46,11 +46,11 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -64,7 +64,7 @@ # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -77,6 +77,6 @@ # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{matrix.language}}" diff --git a/README.md b/README.md index 15d6597..90064a7 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Open source implementation of High-throughput JPEG2000 (HTJ2K), also known as JPH, JPEG2000 Part 15, ISO/IEC 15444-15, and ITU-T T.814. Here, we are interested in implementing the HTJ2K only, supporting features that are defined in JPEG2000 Part 1 (for example, for wavelet transform, only reversible 5/3 and irreversible 9/7 are supported). -The interested reader is referred to the [short HTJ2K white paper](http://ds.jpeg.org/whitepapers/jpeg-htj2k-whitepaper.pdf), or the [extended HTJ2K white paper](https://htj2k.com/wp-content/uploads/white-paper.pdf) for more details on HTJ2K. [This](https://kakadusoftware.com/wp-content/uploads/icip2019.pdf) paper explores the attainable performance on CPU, while [this](https://kakadusoftware.com/wp-content/uploads/ICIP2019_GPU.pdf) and [this](https://webapps.unsworks.library.unsw.edu.au/fapi/datastream/unsworks:75139/bin990339e4-8805-4456-ae30-223d85f9b1c1) explores performance on the GPU.
+The interested reader is referred to the [short HTJ2K white paper](http://ds.jpeg.org/whitepapers/jpeg-htj2k-whitepaper.pdf), or the [extended HTJ2K white paper](https://htj2k.com/wp-content/uploads/white-paper.pdf) for more details on HTJ2K. [This](https://kakadusoftware.com/wp-content/uploads/icip2019.pdf) paper explores the attainable performance on CPU, while [this](https://kakadusoftware.com/wp-content/uploads/ICIP2019_GPU.pdf) and [this](http://hdl.handle.net/1959.4/unsworks_75139) explore performance on the GPU. # The standard # @@ -17,4 +17,8 @@ The standard is available free of charge from [ITU website](https://www.itu.int/rec/T-REC-T.814/en) * [Compiling and Running in Docker](./docs/docker.md) * [Usage Example](./docs/usage_examples.md) * [Web-based Demos](./docs/web_demos.md) -* [Doxygen Documentation Style](./docs/doxygen_style.md) \ No newline at end of file +* [Doxygen Documentation Style](./docs/doxygen_style.md) + +# Repositories # +[![Packaging status](https://repology.org/badge/vertical-allrepos/openjph.svg)](https://repology.org/project/openjph/versions) + diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h index 8e41493..c18ee76 100644 --- a/src/apps/common/ojph_img_io.h +++ b/src/apps/common/ojph_img_io.h @@ -54,7 +54,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // defined elsewhere class mem_fixed_allocator; - struct line_buf; + class line_buf; //////////////////////////////////////////////////////////////////////////// // @@ -135,7 +135,7 @@ namespace ojph { ui32 cur_line; si64 start_of_data; - int planar; + bool planar; ui32 bit_depth[3]; bool is_signed[3]; point subsampling[3]; @@ -446,6 +446,68 @@ namespace ojph { size_t buffer_size; }; + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + class pfm_in : public image_in_base + { + public: + pfm_in(mem_fixed_allocator *p = NULL) + { + fh = 0; + fname = NULL; + alloc_p = p; + temp_buf = NULL; + temp_buf_byte_size = 0; + bit_depth[0] = bit_depth[1] = bit_depth[2] = 32; + scale = 0.0f; + little_endian = true; + width = height = num_comps = 0; + + cur_line = 0; + start_of_data = 0; + } + virtual ~pfm_in() + { + close(); + if (alloc_p == NULL && temp_buf) + free(temp_buf); + } + + void open(const char* filename); + void finalize_alloc(); + void configure(ui32* bit_depth) { + assert(num_comps != 0); + for (ui32 c = 0; c < num_comps; ++c) + this->bit_depth[c] = bit_depth[c]; + } + virtual ui32 read(const line_buf* line, ui32 comp_num); + void close() { if(fh) { fclose(fh); fh = NULL; } fname = NULL; } + + size get_size() { assert(fh); return size(width, height); } + ui32 get_width() { assert(fh); return width; } + ui32 get_height() { assert(fh); return height; } + ui32 get_num_components() { assert(fh); return num_comps; } + + private: + FILE *fh; + const char *fname; + mem_fixed_allocator *alloc_p; + float *temp_buf; + size_t temp_buf_byte_size; + ui32 bit_depth[3]; // this truncates data to bit_depth in the LSB + float scale; + bool little_endian; + ui32 width, height, num_comps; + ui32 cur_line; + si64 start_of_data; + }; + + //////////////////////////////////////////////////////////////////////////// + // Accelerators (defined in ojph_img_io_*) typedef void (*conversion_fun)(const line_buf *ln0, const line_buf *ln1, @@ -559,7 +621,7 @@ namespace ojph { ui32 width, height, num_components; ui32 bit_depth, bytes_per_sample; ui8*
buffer; - ui32 buffer_size; + size_t buffer_size; ui32 cur_line, samples_per_line, bytes_per_line; conversion_fun converter; const line_buf *lptr[3]; @@ -621,7 +683,7 @@ namespace ojph { ui32 bit_depth_of_data[4]; ui32 bytes_per_sample; ui8* buffer; - ui32 buffer_size; + size_t buffer_size; ui32 cur_line, samples_per_line; }; #endif /* OJPH_ENABLE_TIFF_SUPPORT */ @@ -698,11 +760,60 @@ namespace ojph { const char* fname; bool is_signed; ui32 bit_depth, bytes_per_sample; - si32 lower_val, upper_val; + si64 lower_val, upper_val; ui32 width; ui8* buffer; ui32 buffer_size; }; + + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + class pfm_out : public image_out_base + { + public: + pfm_out() + { + fh = NULL; + fname = NULL; + buffer = NULL; + buffer_size = 0; + width = height = num_components = 0; + scale = -1.0f; + bit_depth[0] = bit_depth[1] = bit_depth[2] = 32; + cur_line = 0; + start_of_data = 0; + } + virtual ~pfm_out() + { + close(); + if (buffer) + free(buffer); + } + + void open(char* filename); + void configure(ui32 width, ui32 height, ui32 num_components, + float scale, ui32* bit_depth); + virtual ui32 write(const line_buf* line, ui32 comp_num); + virtual void close() { if(fh) { fclose(fh); fh = NULL; } fname = NULL; } + + private: + FILE *fh; + const char *fname; + float* buffer; + size_t buffer_size; + ui32 width, height, num_components; + float scale; + ui32 bit_depth[3]; + ui32 cur_line; + si64 start_of_data; + }; + + } #endif // !OJPH_IMG_IO_H diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp index 0c4aa0e..144c837 100644 --- a/src/apps/ojph_compress/ojph_compress.cpp +++ b/src/apps/ojph_compress/ojph_compress.cpp @@ -526,9 +526,9 @@ int main(int argc, char * argv[]) { std::cout << "\nThe following arguments are necessary:\n" #ifdef OJPH_ENABLE_TIFF_SUPPORT - " -i input file name (either pgm, ppm, tif(f), or raw(yuv))\n" + " -i input file name (either pgm, ppm, pfm, tif(f), or raw(yuv))\n" #else - " -i input file name (either pgm, ppm, or raw(yuv))\n" + " -i input file name (either pgm, ppm, pfm, or raw(yuv))\n" #endif // !OJPH_ENABLE_TIFF_SUPPORT " -o output file name\n\n" @@ -587,7 +587,33 @@ int main(int argc, char * argv[]) { " component; for example: 12,10,10\n" " -downsamp {x,y},{x,y},...,{x,y} a list of x,y points, one for each\n" " component; for example {1,1},{2,2},{2,2}\n\n" - ; + "\n" + + ".pfm files receive special treatment. Currently, lossy compression\n" + "with these files is not supported, only lossless. When these files are\n" + "used, the NLT segment marker is automatically inserted into the\n" + "codestream when needed, as explained shortly. 
The following arguments\n" + "can be useful for this file type.\n" + " -signed a comma-separated list of true or false parameters, one\n" + " for each component; for example: true,false,false.\n" + " If you are sure that all sample values are positive or 0,\n" + " set the corresponding entry to false; otherwise set it to\n" + " true.\n" + " When a component entry is set to true, an NLT marker\n" + " segment is inserted into the codestream.\n" + " The NLT segment specifies a non-linear transform that\n" + " changes only negative values, producing better coding\n" + " efficiency.\n" + " The NLT marker segment may not be supported by other\n" + " implementations.\n" + " -bit_depth a comma-separated list of bit depth values, one per\n" + " component; for example: 12,10,10.\n" + " Floating-point values are treated as integers, and they\n" + " are shifted to the right, keeping only the specified\n" + " number of bits. Up to 32 bits (which is the default) are\n" + " supported.\n" + + "\n"; return -1; } if (!get_arguments(argc, argv, input_filename, output_filename, @@ -611,6 +637,7 @@ int main(int argc, char * argv[]) { ojph::codestream codestream; ojph::ppm_in ppm; + ojph::pfm_in pfm; ojph::yuv_in yuv; ojph::raw_in raw; ojph::dpx_in dpx; @@ -736,6 +763,106 @@ int main(int argc, char * argv[]) { base = &ppm; } + else if (is_matching(".pfm", v)) + { + pfm.open(input_filename); + ojph::param_siz siz = codestream.access_siz(); + siz.set_image_extent(ojph::point(image_offset.x + pfm.get_width(), + image_offset.y + pfm.get_height())); + ojph::ui32 num_comps = pfm.get_num_components(); + assert(num_comps == 1 || num_comps == 3); + siz.set_num_components(num_comps); + + if (bit_depth[0] != 0) // one was set + if (num_bit_depths < num_comps) // but if not enough, repeat + for (ojph::ui32 c = num_bit_depths; c < num_comps; ++c) + bit_depth[c] = bit_depth[num_bit_depths - 1]; + if (is_signed[0] != -1) // one was set + if (num_is_signed < num_comps) // but if not enough, repeat + for (ojph::ui32 c = num_is_signed; c < num_comps; ++c) + is_signed[c] = is_signed[num_is_signed - 1]; + + bool all_the_same = true; + if (num_comps == 3) + { + all_the_same = all_the_same + && bit_depth[0] == bit_depth[1] + && bit_depth[1] == bit_depth[2]; + all_the_same = all_the_same + && is_signed[0] == is_signed[1] + && is_signed[1] == is_signed[2]; + } + + pfm.configure(bit_depth); + ojph::point ds(1, 1); + for (ojph::ui32 c = 0; c < num_comps; ++c) { + ojph::ui32 bd = 32; + if (bit_depth[c] != 0) + bd = bit_depth[c]; + bool is = true; + if (is_signed[c] != -1) + is = is_signed[c] != 0; + siz.set_component(c, ds, bd, is); + } + siz.set_image_offset(image_offset); + siz.set_tile_size(tile_size); + siz.set_tile_offset(tile_offset); + + ojph::param_cod cod = codestream.access_cod(); + cod.set_num_decomposition(num_decompositions); + cod.set_block_dims(block_size.w, block_size.h); + if (num_precincts != -1) + cod.set_precinct_size(num_precincts, precinct_size); + cod.set_progression_order(prog_order); + if (num_comps == 1) + { + if (employ_color_transform != -1) + OJPH_WARN(0x01000092, + "-colour_trans option is not needed and was not used; " + "this is because the image has one component only\n"); + } + else + { + if (employ_color_transform == -1) + cod.set_color_transform(true); + else + cod.set_color_transform(employ_color_transform == 1); + } + cod.set_reversible(reversible); + if (!reversible && quantization_step != -1.0f) + codestream.access_qcd().set_irrev_quant(quantization_step); + + ojph::param_nlt nlt = 
codestream.access_nlt(); + if (reversible) { + if (all_the_same) + nlt.set_type3_transformation(ojph::param_nlt::ALL_COMPS, true); + else + for (ojph::ui32 c = 0; c < num_comps; ++c) + nlt.set_type3_transformation(c, true); + } + else + OJPH_ERROR(0x01000093, "We currently support lossless only for " + "pfm images; this may change in the future."); + + codestream.set_planar(false); + if (profile_string[0] != '\0') + codestream.set_profile(profile_string); + codestream.set_tilepart_divisions(tileparts_at_resolutions, + tileparts_at_components); + codestream.request_tlm_marker(tlm_marker); + + if (dims.w != 0 || dims.h != 0) + OJPH_WARN(0x01000094, + "-dims option is not needed and was not used\n"); + if (num_components != 0) + OJPH_WARN(0x01000095, + "-num_comps is not needed and was not used\n"); + if (comp_downsampling[0].x != 0 || comp_downsampling[0].y != 0) + OJPH_WARN(0x01000096, + "-downsamp is not needed and was not used\n"); + + base = &pfm; + } #ifdef OJPH_ENABLE_TIFF_SUPPORT else if (is_matching(".tif", v) || is_matching(".tiff", v)) { diff --git a/src/apps/ojph_expand/ojph_expand.cpp b/src/apps/ojph_expand/ojph_expand.cpp index 7d6f3d5..3d3b981 100644 --- a/src/apps/ojph_expand/ojph_expand.cpp +++ b/src/apps/ojph_expand/ojph_expand.cpp @@ -213,6 +213,7 @@ int main(int argc, char *argv[]) { ojph::codestream codestream; ojph::ppm_out ppm; + ojph::pfm_out pfm; #ifdef OJPH_ENABLE_TIFF_SUPPORT ojph::tif_out tif; #endif /* OJPH_ENABLE_TIFF_SUPPORT */ @@ -266,6 +267,59 @@ int main(int argc, char *argv[]) { ppm.open(output_filename); base = &ppm; } + else if (is_matching(".pfm", v)) + { + codestream.set_planar(false); + ojph::param_siz siz = codestream.access_siz(); + ojph::param_cod cod = codestream.access_cod(); + ojph::param_nlt nlt = codestream.access_nlt(); + + ojph::ui32 num_comps = siz.get_num_components(); + if (num_comps != 3 && num_comps != 1) + OJPH_ERROR(0x0200000C, + "The file has %d color components; this cannot be saved to" + " a .pfm file\n", num_comps); + bool all_same = true; + ojph::point p = siz.get_downsampling(0); + for (ojph::ui32 i = 1; i < siz.get_num_components(); ++i) + { + ojph::point p1 = siz.get_downsampling(i); + all_same = all_same && (p1.x == p.x) && (p1.y == p.y); + } + if (!all_same) + OJPH_ERROR(0x0200000D, + "To save an image to pfm, all the components must have the " + "same downsampling ratio\n"); + ojph::ui32 bit_depth[3]; + for (ojph::ui32 c = 0; c < siz.get_num_components(); ++c) { + ojph::ui8 bd = 0; + bool is = true; + bool result = nlt.get_type3_transformation(c, bd, is); + if (result == false) + OJPH_ERROR(0x0200000E, + "This codestream is not supported; it does not have an " + "NLT segment marker for this component (or default NLT " + "settings).\n"); + if (bd != siz.get_bit_depth(c) || is != siz.is_signed(c)) + OJPH_ERROR(0x0200000F, + "There is a discrepancy in component %d configuration between " + "SIZ marker segment, which specifies bit_depth = %d and " + "signedness = %s, and NLT marker segment, which specifies " + "bit_depth = %d and signedness = %s.\n", c, + siz.get_bit_depth(c), siz.is_signed(c) ? "True" : "False", + bd, is ? "True" : "False"); + bit_depth[c] = bd; + } + if (!cod.is_reversible()) + OJPH_ERROR(0x02000010, + "This codestream is lossy (not reversible), and we currently " + "only support reversible codestreams for .pfm target files. 
" + "This is only temporary and will be changed at some point.\n"); + pfm.configure(siz.get_recon_width(0), siz.get_recon_height(0), + siz.get_num_components(), -1.0f, bit_depth); + pfm.open(output_filename); + base = &pfm; + } #ifdef OJPH_ENABLE_TIFF_SUPPORT else if (is_matching(".tif", v) || is_matching(".tiff", v)) { diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index 82bbe10..89b8127 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -247,7 +247,7 @@ namespace ojph { assert(fh == 0); fh = fopen(filename, "rb"); if (fh == 0) - OJPH_ERROR(0x030000001, "Unable to open file %s", filename); + OJPH_ERROR(0x03000001, "Unable to open file %s", filename); fname = filename; // read magic number @@ -255,27 +255,27 @@ namespace ojph { if (fread(t, 1, 2, fh) != 2) { close(); - OJPH_ERROR(0x030000002, "Error reading file %s", filename); + OJPH_ERROR(0x03000002, "Error reading file %s", filename); } // check magic number if (t[0] != 'P' || (t[1] != '5' && t[1] != '6')) { close(); - OJPH_ERROR(0x030000003, "unknown file type for file %s", filename); + OJPH_ERROR(0x03000003, "unknown file type for file %s", filename); } size_t len = strlen(filename); if (t[1] == '5' && strncmp(filename + len - 4, ".pgm", 4) != 0) { close(); - OJPH_ERROR(0x030000004, "wrong file extension, a file with " + OJPH_ERROR(0x03000004, "wrong file extension, a file with " "keyword P5 must have a .pgm extension for file %s", filename); } if (t[1] == '6' && strncmp(filename + len - 4, ".ppm", 4) != 0) { close(); - OJPH_ERROR(0x030000005, "wrong file extension, a file with keyword P6 " + OJPH_ERROR(0x03000005, "wrong file extension, a file with keyword P6 " "must have a .ppm extension for file %s", filename); } @@ -287,7 +287,7 @@ namespace ojph { if (fscanf(fh, "%d %d %d", &width, &height, &max_val) != 3) { close(); - OJPH_ERROR(0x030000006, "error in file format for file %s", filename); + OJPH_ERROR(0x03000006, "error in file format for file %s", filename); } num_ele_per_line = num_comps * width; bytes_per_sample = max_val > 255 ? 
2 : 1; @@ -309,7 +309,7 @@ temp_buf = malloc(temp_buf_byte_size); if (temp_buf == NULL) { // failed to allocate memory if (t) free(t); // the original buffer is still valid - OJPH_ERROR(0x030000007, "error allocating memory"); + OJPH_ERROR(0x03000007, "error allocating memory"); } } else @@ -329,9 +329,9 @@ return; if (bytes_per_sample == 1) - temp_buf = alloc_p->post_alloc_data<ui8>(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data<ui8>(num_comps * (size_t)width, 0); else - temp_buf = alloc_p->post_alloc_data<ui16>(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data<ui16>(num_comps * (size_t)width, 0); } ///////////////////////////////////////////////////////////////////////////// @@ -347,7 +347,7 @@ if (result != num_ele_per_line) { close(); - OJPH_ERROR(0x030000011, "not enough data in file %s", fname); + OJPH_ERROR(0x03000011, "not enough data in file %s", fname); } if (++cur_line >= height) { @@ -394,21 +394,21 @@ if (strncmp(".ppm", filename + len - 4, 4) == 0) { filename[len - 2] = 'g'; - OJPH_WARN(0x03000001, "file was renamed %s\n", filename); + OJPH_WARN(0x03000021, "file was renamed %s\n", filename); } if (strncmp(".PPM", filename + len - 4, 4) == 0) { filename[len - 2] = 'G'; - OJPH_WARN(0x03000002, "file was renamed %s\n", filename); + OJPH_WARN(0x03000022, "file was renamed %s\n", filename); } } fh = fopen(filename, "wb"); if (fh == NULL) - OJPH_ERROR(0x030000021, + OJPH_ERROR(0x03000023, "unable to open file %s for writing", filename); fprintf(fh, "P5\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } else @@ -419,23 +419,23 @@ if (strncmp(".pgm", filename + len - 4, 4) == 0) { filename[len - 2] = 'p'; - OJPH_WARN(0x03000003, "file was renamed %s\n", filename); + OJPH_WARN(0x03000024, "file was renamed %s\n", filename); } if (strncmp(".PGM", filename + len - 4, 4) == 0) { filename[len - 2] = 'P'; - OJPH_WARN(0x03000004, "file was renamed %s\n", filename); + OJPH_WARN(0x03000025, "file was renamed %s\n", filename); } } fh = fopen(filename, "wb"); if (fh == NULL) - OJPH_ERROR(0x030000022, + OJPH_ERROR(0x03000026, "unable to open file %s for writing", filename); int result = //the number of written characters fprintf(fh, "P6\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); if (result == 0) - OJPH_ERROR(0x030000023, "error writing to file %s", filename); - buffer_size = width * num_components * bytes_per_sample; + OJPH_ERROR(0x03000027, "error writing to file %s", filename); + buffer_size = (size_t)width * num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } fname = filename; @@ -448,7 +448,7 @@ { assert(fh == NULL); //configure before opening if (num_components != 1 && num_components != 3) - OJPH_ERROR(0x030000031, + OJPH_ERROR(0x03000031, "ppm supports 3 colour components, while pgm supports 1"); this->width = width; this->height = height; @@ -530,12 +530,257 @@ size_t result = fwrite(buffer, bytes_per_sample, samples_per_line, fh); if (result != samples_per_line) - OJPH_ERROR(0x030000042, "error writing to file %s", fname); + OJPH_ERROR(0x03000041, "error writing to file %s", fname); } return 0; } //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + + 
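[Reviewer note: not part of the patch.] For orientation, the PFM header that pfm_in::open() below parses, and that pfm_out::open() later emits, is a short ASCII preamble followed by raw 32-bit IEEE-754 floats stored bottom row first. A minimal sketch of a conforming header writer follows; write_pfm_header is a hypothetical helper, but the fprintf format mirrors the one used by pfm_out::open() in this patch:

    #include <cstdio>

    // "PF" announces 3-component color, "Pf" 1-component grayscale.
    // The sign of the scale encodes sample endianness: negative means
    // little-endian. Rows of raw floats follow, bottom row first.
    void write_pfm_header(std::FILE* fh, int width, int height,
                          int num_comps, float scale)
    {
      std::fprintf(fh, "P%c\n%d %d\n%f\n",
                   num_comps > 1 ? 'F' : 'f', width, height, scale);
    }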
///////////////////////////////////////////////////////////////////////////// + void pfm_in::open(const char *filename) + { + assert(fh == 0); + fh = fopen(filename, "rb"); + if (fh == 0) + OJPH_ERROR(0x03000051, "Unable to open file %s", filename); + fname = filename; + + // read magic number + char t[2]; + if (fread(t, 1, 2, fh) != 2) + { + close(); + OJPH_ERROR(0x03000052, "Error reading file %s", filename); + } + + // check magic number + if (t[0] != 'P' || (t[1] != 'F' && t[1] != 'f')) + { + close(); + OJPH_ERROR(0x03000053, "Unknown file type for file %s", filename); + } + + // set number of components based on file-type + num_comps = t[1] == 'f' ? 1 : 3; + eat_white_spaces(fh); + + // read width and height from the header + if (fscanf(fh, "%d %d", &width, &height) != 2) + { + close(); + OJPH_ERROR(0x03000054, + "Error reading width and height in file %s", filename); + } + eat_white_spaces(fh); + + // read scale; its sign selects little or big-endian + if (fscanf(fh, "%f", &scale) != 1) + { + close(); + OJPH_ERROR(0x03000055, "Error reading scale in file %s", filename); + } + little_endian = scale < 0.0f; + scale = std::abs(scale); + + fgetc(fh); + start_of_data = ojph_ftell(fh); + + // alloc. linebuffer to hold a line of image data, if more than 1 comp. + if (temp_buf_byte_size < num_comps * (size_t)width * sizeof(float)) + { + if (alloc_p == NULL) + { + temp_buf_byte_size = num_comps * (size_t)width * sizeof(float); + void* t = temp_buf; + if (temp_buf) + temp_buf = (float*)realloc(temp_buf, temp_buf_byte_size); + else + temp_buf = (float*)malloc(temp_buf_byte_size); + if (temp_buf == NULL) { // failed to allocate memory + if (t) free(t); // the original buffer is still valid + OJPH_ERROR(0x03000056, "Error allocating memory"); + } + } + else + { + assert(temp_buf_byte_size == 0); //cannot reallocate the buffer + temp_buf_byte_size = num_comps * (size_t)width * sizeof(float); + alloc_p->pre_alloc_data<ui8>(temp_buf_byte_size, 0); + } + } + cur_line = 0; + } + + ///////////////////////////////////////////////////////////////////////////// + void pfm_in::finalize_alloc() + { + if (alloc_p == NULL) + return; + temp_buf = alloc_p->post_alloc_data<float>(num_comps * (size_t)width, 0); + } + + ///////////////////////////////////////////////////////////////////////////// + ui32 pfm_in::read(const line_buf* line, ui32 comp_num) + { + assert(temp_buf_byte_size != 0); + assert(fh != 0 && comp_num < num_comps); + assert(line->size >= width); + + if (comp_num == 0) + { + si64 loc = start_of_data; + loc += (size_t)(height-1 - cur_line) * (size_t)num_comps + * (size_t)width * sizeof(float); + if (ojph_fseek(fh, loc, SEEK_SET) != 0) + { + close(); + OJPH_ERROR(0x03000061, "Error seeking in file %s", fname); + } + size_t result = + fread(temp_buf, sizeof(float), (size_t)num_comps * (size_t)width, fh); + if (result != (size_t)num_comps * (size_t)width) + { + close(); + OJPH_ERROR(0x03000062, "Not enough data in file %s", fname); + } + if (++cur_line >= height) + cur_line = 0; + } + + union { + si32* s; + ui32* u; + float* f; + } sp, dp; + + if (little_endian) + { + ui32 shift = 32 - bit_depth[comp_num]; + sp.f = temp_buf + comp_num; + dp.f = line->f32; + if (shift) + for (ui32 i = width; i > 0; --i, sp.f += num_comps) + { + si32 s = *sp.s; + s >>= shift; + *dp.s++ = s; + } + else + for (ui32 i = width; i > 0; --i, sp.f += num_comps) + *dp.f++ = *sp.f; + } + else { + ui32 shift = 32 - bit_depth[comp_num]; + sp.f = temp_buf + comp_num; + dp.f = line->f32; + if (shift) + for (ui32 i = width; i > 0; --i, sp.f += num_comps) { + ui32 u = 
be2le(*sp.u); + si32 s = *(si32*)&u; + s >>= shift; + *dp.s++ = s; + } + else + for (ui32 i = width; i > 0; --i, sp.f += num_comps) + *dp.u++ = be2le(*sp.u); + } + + return width; + } + + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + + //////////////////////////////////////////////////////////////////////////// + void pfm_out::open(char* filename) + { + assert(fh == NULL && buffer == NULL); + fh = fopen(filename, "wb"); + if (fh == NULL) + OJPH_ERROR(0x03000071, + "Unable to open file %s for writing", filename); + int result = //the number of written characters + fprintf(fh, "P%c\n%d %d\n%f\n", + num_components > 1 ? 'F' : 'f', width, height, scale); + if (result == 0) + OJPH_ERROR(0x03000072, "error writing to file %s", filename); + buffer_size = (size_t)width * num_components * sizeof(float); + buffer = (float*)malloc(buffer_size); + fname = filename; + cur_line = 0; + start_of_data = ojph_ftell(fh); + } + + //////////////////////////////////////////////////////////////////////////// + void pfm_out::configure(ui32 width, ui32 height, ui32 num_components, + float scale, ui32* bit_depth) + { + assert(fh == NULL); //configure before opening + if (num_components != 1 && num_components != 3) + OJPH_ERROR(0x03000081, + "pfm supports 1 or 3 colour components, not %d", num_components); + this->width = width; + this->height = height; + this->num_components = num_components; + this->scale = scale < 0.0f ? scale : -scale; + for (ui32 c = 0; c < num_components; ++c) + this->bit_depth[c] = bit_depth[c]; + } + + //////////////////////////////////////////////////////////////////////////// + ui32 pfm_out::write(const line_buf* line, ui32 comp_num) + { + assert(fh); + + ui32 shift = 32 - bit_depth[comp_num]; + union { + ui32* u; + float* f; + } sp, dp; + + dp.f = buffer + comp_num; + sp.f = line->f32; + + if (shift) + for (ui32 i = width; i > 0; --i, dp.f += num_components, ++sp.f) + { + ui32 u = *sp.u; + u <<= shift; + *dp.u = u; + } + else + for (ui32 i = width; i > 0; --i, dp.f += num_components) + *dp.f = *sp.f++; + + if (comp_num == num_components - 1) + { + size_t samples_per_line = num_components * (size_t)width; + si64 loc = start_of_data; + loc += (height - 1 - cur_line)* samples_per_line * sizeof(float); + if (ojph_fseek(fh, loc, SEEK_SET) != 0) + OJPH_ERROR(0x03000082, "Error seeking in file %s", fname); + size_t result = fwrite(buffer, sizeof(float), samples_per_line, fh); + if (result != samples_per_line) + OJPH_ERROR(0x03000083, "error writing to file %s", fname); + ++cur_line; + } + + return 0; + } + + //////////////////////////////////////////////////////////////////////////// // // // @@ -548,7 +793,7 @@ namespace ojph { { tiff_handle = NULL; if ((tiff_handle = TIFFOpen(filename, "r")) == NULL) - OJPH_ERROR(0x0300000B1, "Unable to open file %s", filename); + OJPH_ERROR(0x03000091, "Unable to open file %s", filename); fname = filename; ui32 tiff_width = 0; @@ -588,7 +833,7 @@ namespace ojph { // allocate linebuffer to hold a line of image data line_buffer = malloc(bytes_per_line); if (NULL == line_buffer) - OJPH_ERROR(0x0300000B2, "Unable to allocate %d bytes for line_buffer[] " + OJPH_ERROR(0x03000092, "Unable to allocate %d bytes for line_buffer[] " "for file %s", bytes_per_line, filename); cur_line = 0; @@ -596,7 +841,7 @@ namespace ojph { // Error on known incompatilbe input formats if( tiff_bits_per_sample != 8 && tiff_bits_per_sample != 16 ) { - 
OJPH_ERROR(0x0300000B3, "\nTIFF IO is currently limited" + OJPH_ERROR(0x03000093, "\nTIFF IO is currently limited" " to files with TIFFTAG_BITSPERSAMPLE=8 and TIFFTAG_BITSPERSAMPLE=16 \n" "input file = %s has TIFFTAG_BITSPERSAMPLE=%d", filename, tiff_bits_per_sample); @@ -604,14 +849,14 @@ namespace ojph { if( TIFFIsTiled( tiff_handle ) ) { - OJPH_ERROR(0x0300000B4, "\nTIFF IO is currently limited to TIF files " + OJPH_ERROR(0x03000094, "\nTIFF IO is currently limited to TIF files " "without tiles. \nInput file %s has been detected as tiled", filename); } if(PHOTOMETRIC_RGB != tiff_photometric && PHOTOMETRIC_MINISBLACK != tiff_photometric ) { - OJPH_ERROR(0x0300000B5, "\nTIFF IO is currently limited to " + OJPH_ERROR(0x03000095, "\nTIFF IO is currently limited to " "TIFFTAG_PHOTOMETRIC=PHOTOMETRIC_MINISBLACK=%d and " "PHOTOMETRIC_RGB=%d. \nInput file %s has been detected " "TIFFTAG_PHOTOMETRIC=%d", @@ -620,7 +865,7 @@ namespace ojph { if( tiff_samples_per_pixel > 4 ) { - OJPH_ERROR(0x0300000B6, "\nTIFF IO is currently limited to " + OJPH_ERROR(0x03000096, "\nTIFF IO is currently limited to " "TIFFTAG_SAMPLESPERPIXEL=4 \nInput file %s has been detected with " "TIFFTAG_SAMPLESPERPIXEL=%d", filename, tiff_samples_per_pixel); @@ -642,7 +887,7 @@ namespace ojph { line_buffer_for_planar_support_uint8 = (uint8_t*)calloc(width, sizeof(uint8_t)); if (NULL == line_buffer_for_planar_support_uint8) - OJPH_ERROR(0x0300000B7, "Unable to allocate %d bytes for " + OJPH_ERROR(0x03000097, "Unable to allocate %d bytes for " "line_buffer_for_planar_support_uint8[] for file %s", width * sizeof(uint8_t), filename); } @@ -652,7 +897,7 @@ namespace ojph { line_buffer_for_planar_support_uint16 = (uint16_t*)calloc(width, sizeof(uint16_t)); if (NULL == line_buffer_for_planar_support_uint16) - OJPH_ERROR(0x0300000B8, "Unable to allocate %d bytes for " + OJPH_ERROR(0x03000098, "Unable to allocate %d bytes for " "line_buffer_for_planar_support_uint16[] for file %s", width * sizeof(uint16_t), filename); } @@ -664,7 +909,7 @@ namespace ojph { void tif_in::set_bit_depth(ui32 num_bit_depths, ui32* bit_depth) { if (num_bit_depths < 1) - OJPH_ERROR(0x030000B9, "one or more bit_depths must be provided"); + OJPH_ERROR(0x030000A1, "one or more bit_depths must be provided"); ui32 last_bd_idx = 0; for (ui32 i = 0; i < 4; ++i) { @@ -673,7 +918,7 @@ namespace ojph { if (bd > 32 || bd < 1) { - OJPH_ERROR(0x0300000BA, + OJPH_ERROR(0x030000A2, "bit_depth = %d, this must be an integer from 1-32", bd); } this->bit_depth[i] = bd; @@ -690,12 +935,12 @@ namespace ojph { // the first time trying to access this line if (PLANARCONFIG_SEPARATE == planar_configuration && 0 == comp_num ) { - for (unsigned short color = 0; color < num_comps; color++) + for (ui32 color = 0; color < num_comps; color++) { if (bytes_per_sample == 1) { TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint8, - cur_line, color); + cur_line, (ui16)color); ui32 x = color; uint8_t* line_buffer_of_interleaved_components = (uint8_t*)line_buffer; @@ -708,7 +953,7 @@ namespace ojph { else if (bytes_per_sample == 2) { TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint16, - cur_line, color); + cur_line, (ui16)color); ui32 x = color; ui16* line_buffer_of_interleaved_components = (ui16*)line_buffer; for (ui32 i = 0; i < width; i++, x += num_comps) @@ -809,23 +1054,23 @@ namespace ojph { } if (max_bitdepth > 16) { - OJPH_WARN(0x0300000C2, "TIFF output is currently limited to files " + OJPH_WARN(0x030000B1, "TIFF output is currently limited to files " 
"with max_bitdepth = 16, the source codestream has max_bitdepth=%d" ", the decoded data will be truncated to 16 bits", max_bitdepth); } if (num_components > 4) { - OJPH_ERROR(0x0300000C3, "TIFF IO is currently limited to files with " + OJPH_ERROR(0x030000B2, "TIFF IO is currently limited to files with " "num_components=1 to 4"); } assert(tiff_handle == NULL && buffer == NULL); if ((tiff_handle = TIFFOpen(filename, "w")) == NULL) { - OJPH_ERROR(0x0300000C1, "unable to open file %s for writing", filename); + OJPH_ERROR(0x030000B3, "unable to open file %s for writing", filename); } - buffer_size = width * num_components * bytes_per_sample; + buffer_size = width * (size_t)num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; cur_line = 0; @@ -901,7 +1146,7 @@ namespace ojph { bytes_per_sample = 2; } samples_per_line = num_components * width; - bytes_per_line = bytes_per_sample * samples_per_line; + bytes_per_line = bytes_per_sample * (size_t)samples_per_line; } @@ -1014,7 +1259,7 @@ namespace ojph { { int result = TIFFWriteScanline(tiff_handle, buffer, cur_line++); if (result != 1) - OJPH_ERROR(0x0300000C4, "error writing to file %s", fname); + OJPH_ERROR(0x030000C1, "error writing to file %s", fname); } return 0; } @@ -1034,7 +1279,7 @@ namespace ojph { assert(fh == NULL); fh = fopen(filename, "rb"); if (fh == 0) - OJPH_ERROR(0x03000051, "Unable to open file %s", filename); + OJPH_ERROR(0x030000D1, "Unable to open file %s", filename); //need to extract info from filename @@ -1062,7 +1307,7 @@ namespace ojph { if (result != width[comp_num]) { close(); - OJPH_ERROR(0x03000061, "not enough data in file %s", fname); + OJPH_ERROR(0x030000E1, "not enough data in file %s", fname); } if (bytes_per_sample[comp_num] == 1) @@ -1088,11 +1333,11 @@ namespace ojph { ui32 num_downsamplings, const point *subsampling) { if (num_components != 1 && num_components !=3) - OJPH_ERROR(0x03000071, "yuv_in support 1 or 3 components"); + OJPH_ERROR(0x030000F1, "yuv_in support 1 or 3 components"); this->num_com = num_components; if (num_downsamplings < 1) - OJPH_ERROR(0x03000072, "one or more downsampling must be provided"); + OJPH_ERROR(0x030000F2, "one or more downsampling must be provided"); ui32 last_downsamp_idx = 0; for (ui32 i = 0; i < num_components; ++i) @@ -1114,7 +1359,7 @@ namespace ojph { void yuv_in::set_bit_depth(ui32 num_bit_depths, ui32* bit_depth) { if (num_bit_depths < 1) - OJPH_ERROR(0x03000081, "one or more bit_depths must be provided"); + OJPH_ERROR(0x03000101, "one or more bit_depths must be provided"); ui32 last_bd_idx = 0; for (ui32 i = 0; i < 3; ++i) { @@ -1156,7 +1401,7 @@ namespace ojph { assert(fh == NULL); //configure before open fh = fopen(filename, "wb"); if (fh == 0) - OJPH_ERROR(0x03000091, "Unable to open file %s", filename); + OJPH_ERROR(0x03000111, "Unable to open file %s", filename); fname = filename; } @@ -1199,7 +1444,7 @@ namespace ojph { *dp++ = (ui16)val; } if (fwrite(buffer, 2, w, fh) != w) - OJPH_ERROR(0x030000A1, "unable to write to file %s", fname); + OJPH_ERROR(0x03000121, "unable to write to file %s", fname); } else { @@ -1213,7 +1458,7 @@ namespace ojph { *dp++ = (ui8)val; } if (fwrite(buffer, 1, w, fh) != w) - OJPH_ERROR(0x030000A2, "unable to write to file %s", fname); + OJPH_ERROR(0x03000122, "unable to write to file %s", fname); } return w; @@ -1233,11 +1478,11 @@ namespace ojph { assert(fh == NULL); fh = fopen(filename, "rb"); if (fh == NULL) - OJPH_ERROR(0x030000C1, "Unable to open file %s", filename); + 
OJPH_ERROR(0x03000131, "Unable to open file %s", filename); cur_line = 0; bytes_per_sample = (bit_depth + 7) >> 3; - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; } @@ -1251,7 +1496,7 @@ namespace ojph { if (result != width) { close(); - OJPH_ERROR(0x030000C2, "not enough data in file %s", fname); + OJPH_ERROR(0x03000132, "not enough data in file %s", fname); } if (bytes_per_sample > 3) @@ -1360,7 +1605,7 @@ namespace ojph { assert(fh == NULL); //configure before open fh = fopen(filename, "wb"); if (fh == 0) - OJPH_ERROR(0x03000091, "Unable to open file %s", filename); + OJPH_ERROR(0x03000141, "Unable to open file %s", filename); fname = filename; } @@ -1373,11 +1618,11 @@ namespace ojph { this->width = width; if (is_signed) { - upper_val = (1 << (bit_depth - 1)); - lower_val = -(1 << (bit_depth - 1)); + upper_val = ((si64)1 << (bit_depth - 1)); + lower_val = -((si64)1 << (bit_depth - 1)); } else { - upper_val = 1 << bit_depth; - lower_val = 0; + upper_val = (si64)1 << bit_depth; + lower_val = (si64)0; } bytes_per_sample = (bit_depth + 7) >> 3; @@ -1392,63 +1637,127 @@ namespace ojph { assert(fh); assert(comp_num == 0); - if (bytes_per_sample > 3) + if (is_signed) { - const si32* sp = line->i32; - ui32* dp = (ui32*)buffer; - for (ui32 i = width; i > 0; --i) + if (bytes_per_sample > 3) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp++ = (ui32)val; + const si32* sp = line->i32; + si32* dp = (si32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (si32)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000151, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B1, "unable to write to file %s", fname); - } - else if (bytes_per_sample > 2) - { - const si32* sp = line->i32; - ui32* dp = (ui32*)buffer; - for (ui32 i = width; i > 0; --i) + else if (bytes_per_sample > 2) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp = (ui32)val; - // this only works for little endian architecture - dp = (ui32*)((ui8*)dp + 3); + const si32* sp = line->i32; + si32* dp = (si32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp = (si32)val; + // this only works for little endian architecture + dp = (si32*)((ui8*)dp + 3); + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000152, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B2, "unable to write to file %s", fname); - } - else if (bytes_per_sample > 1) - { - const si32* sp = line->i32; - ui16* dp = (ui16*)buffer; - for (ui32 i = width; i > 0; --i) + else if (bytes_per_sample > 1) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp++ = (ui16)val; + const si32* sp = line->i32; + si16* dp = (si16*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? 
val : lower_val; + *dp++ = (si16)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000153, "unable to write to file %s", fname); + } + else + { + const si32* sp = line->i32; + si8* dp = (si8*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (si8)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000154, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B3, "unable to write to file %s", fname); } - else + else { - const si32* sp = line->i32; - ui8* dp = (ui8*)buffer; - for (ui32 i = width; i > 0; --i) + if (bytes_per_sample > 3) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp++ = (ui8)val; + const ui32* sp = (ui32*)line->i32; + ui32* dp = (ui32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (ui32)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000155, "unable to write to file %s", fname); + } + else if (bytes_per_sample > 2) + { + const ui32* sp = (ui32*)line->i32; + ui32* dp = (ui32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp = (ui32)val; + // this only works for little endian architecture + dp = (ui32*)((ui8*)dp + 3); + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000156, "unable to write to file %s", fname); + } + else if (bytes_per_sample > 1) + { + const ui32* sp = (ui32*)line->i32; + ui16* dp = (ui16*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (ui16)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000157, "unable to write to file %s", fname); + } + else + { + const ui32* sp = (ui32*)line->i32; + ui8* dp = (ui8*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (ui8)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000158, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B4, "unable to write to file %s", fname); } return width; @@ -1470,7 +1779,7 @@ namespace ojph { assert(file_handle == 0); file_handle = fopen(filename, "rb"); if (0 == file_handle) - OJPH_ERROR(0x0300000D1, "Unable to open file %s", filename); + OJPH_ERROR(0x03000161, "Unable to open file %s", filename); fname = filename; // read magic number @@ -1478,7 +1787,7 @@ namespace ojph { if (fread(&magic_number, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000D2, "Error reading file %s", filename); + OJPH_ERROR(0x03000162, "Error reading file %s", filename); } // check magic number @@ -1497,7 +1806,7 @@ namespace ojph { else { close(); - OJPH_ERROR(0x0300000D3, "Error reading file %s - this does not appear " + OJPH_ERROR(0x03000163, "Error reading file %s - this does not appear " "to be a valid DPX file. It has magic number = 0x%08X. 
The magic " "number of a DPX file is 0x%08X.", filename, magic_number, dpx_magic_number); @@ -1508,7 +1817,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000D4, "Error reading file %s", filename); + OJPH_ERROR(0x03000164, "Error reading file %s", filename); } if (is_byte_swapping_necessary) offset_to_image_data_in_bytes = be2le(offset_to_image_data_in_bytes); @@ -1516,14 +1825,14 @@ namespace ojph { if (fread(version, sizeof(uint8_t), 8, file_handle) != 8) { close(); - OJPH_ERROR(0x0300000D5, "Error reading file %s", filename); + OJPH_ERROR(0x03000165, "Error reading file %s", filename); } // read image file size in bytes if (fread(&total_image_file_size_in_bytes, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000D6, "Error reading file %s", filename); + OJPH_ERROR(0x03000166, "Error reading file %s", filename); } if (is_byte_swapping_necessary) total_image_file_size_in_bytes = be2le(total_image_file_size_in_bytes); @@ -1532,14 +1841,14 @@ namespace ojph { if (fseek(file_handle,768, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000D7, "Error reading file %s", filename); + OJPH_ERROR(0x03000167, "Error reading file %s", filename); } // read image_orientation if (fread(&image_orientation, sizeof(uint16_t), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000D8, "Error reading file %s", filename); + OJPH_ERROR(0x03000168, "Error reading file %s", filename); } if (is_byte_swapping_necessary) image_orientation = be2le(image_orientation); @@ -1549,7 +1858,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000D9, "Error reading file %s", filename); + OJPH_ERROR(0x03000169, "Error reading file %s", filename); } if (is_byte_swapping_necessary) number_of_image_elements = be2le(number_of_image_elements); @@ -1558,7 +1867,7 @@ namespace ojph { if (fread(&pixels_per_line, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000DA, "Error reading file %s", filename); + OJPH_ERROR(0x0300016A, "Error reading file %s", filename); } if (is_byte_swapping_necessary) pixels_per_line = be2le(pixels_per_line); @@ -1567,7 +1876,7 @@ namespace ojph { if (fread(&lines_per_image_element, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000DB, "Error reading file %s", filename); + OJPH_ERROR(0x0300016B, "Error reading file %s", filename); } if (is_byte_swapping_necessary) lines_per_image_element = be2le(lines_per_image_element); @@ -1576,7 +1885,7 @@ namespace ojph { if (fseek(file_handle, 780, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000DC, "Error reading file %s", filename); + OJPH_ERROR(0x0300016C, "Error reading file %s", filename); } // read data sign for image element @@ -1584,7 +1893,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000DE, "Error reading file %s", filename); + OJPH_ERROR(0x0300016E, "Error reading file %s", filename); } if (is_byte_swapping_necessary) data_sign_for_image_element_1 = be2le(data_sign_for_image_element_1); @@ -1593,7 +1902,7 @@ namespace ojph { if (fseek(file_handle, 800, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000DF, "Error reading file %s", filename); + OJPH_ERROR(0x0300016F, "Error reading file %s", filename); } // read descriptor @@ -1601,7 +1910,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E0, "Error reading file %s", filename); + OJPH_ERROR(0x03000170, "Error reading file %s", filename); } // read transfer characteristic @@ -1609,7 +1918,7 @@ namespace ojph { 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000E1, "Error reading file %s", filename); + 
OJPH_ERROR(0x03000171, "Error reading file %s", filename); } // read colorimetric specification @@ -1617,7 +1926,7 @@ namespace ojph { 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000E2, "Error reading file %s", filename); + OJPH_ERROR(0x03000172, "Error reading file %s", filename); } // read bit depth @@ -1625,7 +1934,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E3, "Error reading file %s", filename); + OJPH_ERROR(0x03000173, "Error reading file %s", filename); } // read packing @@ -1633,7 +1942,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E4, "Error reading file %s", filename); + OJPH_ERROR(0x03000174, "Error reading file %s", filename); } if (is_byte_swapping_necessary) packing_for_image_element_1 = be2le(packing_for_image_element_1); @@ -1643,7 +1952,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E5, "Error reading file %s", filename); + OJPH_ERROR(0x03000175, "Error reading file %s", filename); } if (is_byte_swapping_necessary) encoding_for_image_element_1 = be2le(encoding_for_image_element_1); @@ -1653,7 +1962,7 @@ namespace ojph { file_handle) != 1) { close(); - OJPH_ERROR(0x0300000E6, "Error reading file %s", filename); + OJPH_ERROR(0x03000176, "Error reading file %s", filename); } if (is_byte_swapping_necessary) offset_to_data_for_image_element_1 = @@ -1663,7 +1972,7 @@ namespace ojph { if (fseek(file_handle, (long)offset_to_image_data_in_bytes, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000E7, "Error reading file %s", filename); + OJPH_ERROR(0x03000177, "Error reading file %s", filename); } // set ojph properties @@ -1689,17 +1998,17 @@ namespace ojph { // allocate linebuffer to hold a line of image data from the file line_buffer = malloc(number_of_32_bit_words_per_line * sizeof(ui32) ); if (NULL == line_buffer) - OJPH_ERROR(0x0300000E8, "Unable to allocate %d bytes for line_buffer[] " + OJPH_ERROR(0x03000178, "Unable to allocate %d bytes for line_buffer[] " "for file %s", number_of_32_bit_words_per_line * sizeof(ui32), filename); // allocate line_buffer_16bit_samples to hold a line of image data in memory line_buffer_16bit_samples = - (ui16*) malloc(width * num_comps * sizeof(ui16)); + (ui16*) malloc((size_t)width * num_comps * sizeof(ui16)); if (NULL == line_buffer_16bit_samples) - OJPH_ERROR(0x0300000E9, "Unable to allocate %d bytes for " + OJPH_ERROR(0x03000179, "Unable to allocate %d bytes for " "line_buffer_16bit_samples[] for file %s", - width * num_comps * sizeof(ui16), filename); + (size_t)width * num_comps * sizeof(ui16), filename); cur_line = 0; @@ -1719,7 +2028,7 @@ namespace ojph { file_handle) != number_of_32_bit_words_per_line) { close(); - OJPH_ERROR(0x0300000F1, "Error reading file %s", fname); + OJPH_ERROR(0x03000181, "Error reading file %s", fname); } if (true == is_byte_swapping_necessary) @@ -1773,7 +2082,7 @@ namespace ojph { } else { - OJPH_ERROR(0x0300000F2, "file %s uses DPX image formats that are not " + OJPH_ERROR(0x03000182, "file %s uses DPX image formats that are not " "yet supported by this software\n bitdepth_for_image_element_1 = " "%d\n num_comps=%d\npacking_for_image_element_1=%d\n " "descriptor_for_image_element_1=%d", fname, diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 555de0e..1c6856a 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -10,6 +10,7 @@ file(GLOB CODESTREAM_WASM "codestream/*_wasm.cpp") file(GLOB CODING "coding/*.cpp" "coding/*.h") file(GLOB CODING_SSSE3 "coding/*_ssse3.cpp") file(GLOB CODING_WASM "coding/*_wasm.cpp") +file(GLOB 
CODING_AVX2 "coding/*_avx2.cpp") file(GLOB CODING_AVX512 "coding/*_avx512.cpp") file(GLOB COMMON "common/*.h") file(GLOB OTHERS "others/*.cpp") @@ -22,7 +23,7 @@ file(GLOB TRANSFORM_AVX512 "transform/*_avx512.cpp") file(GLOB TRANSFORM_WASM "transform/*_wasm.cpp") list(REMOVE_ITEM CODESTREAM ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODESTREAM_WASM}) -list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX512}) +list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX2} ${CODING_AVX512}) list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512} ${TRANSFORM_WASM}) list(APPEND SOURCES ${CODESTREAM} ${CODING} ${COMMON} ${OTHERS} ${TRANSFORM}) @@ -70,9 +71,10 @@ else() source_group("transform" FILES ${TRANSFORM_AVX}) endif() if (NOT OJPH_DISABLE_AVX2) - list(APPEND SOURCES ${CODESTREAM_AVX2} ${TRANSFORM_AVX2}) + list(APPEND SOURCES ${CODESTREAM_AVX2} ${TRANSFORM_AVX2} ${CODING_AVX2}) source_group("codestream" FILES ${CODESTREAM_AVX2}) source_group("transform" FILES ${TRANSFORM_AVX2}) + source_group("coding" FILES ${CODING_AVX2}) endif() if ((NOT OJPH_DISABLE_AVX512) AND ("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64")) list(APPEND SOURCES ${CODING_AVX512} ${TRANSFORM_AVX512}) @@ -84,6 +86,8 @@ else() if (MSVC) set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX") set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") + set_source_files_properties(coding/ojph_block_decoder_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") + set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512") set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX") set_source_files_properties(transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") @@ -94,6 +98,8 @@ else() set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS -mavx) set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) set_source_files_properties(coding/ojph_block_decoder_ssse3.cpp PROPERTIES COMPILE_FLAGS -mssse3) + set_source_files_properties(coding/ojph_block_decoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) + set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx512cd) set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS -mavx) set_source_files_properties(transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) diff --git a/src/core/codestream/ojph_bitbuffer_write.h b/src/core/codestream/ojph_bitbuffer_write.h index d5b6bca..ecb9dd2 100644 --- a/src/core/codestream/ojph_bitbuffer_write.h +++ b/src/core/codestream/ojph_bitbuffer_write.h @@ -109,33 +109,25 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline + void bb_put_zeros(bit_write_buf *bbp, int num_zeros, + mem_elastic_allocator *elastic, + coded_lists*& cur_coded_list, ui32& ph_bytes) + { + for (int i = num_zeros; i > 0; --i) + bb_put_bit(bbp, 0, elastic, cur_coded_list, ph_bytes); + } + ////////////////////////////////////////////////////////////////////////// static inline void 
bb_put_bits(bit_write_buf *bbp, ui32 data, int num_bits, mem_elastic_allocator *elastic, coded_lists*& cur_coded_list, ui32& ph_bytes) { -// assert(num_bits <= 32); - for (int i = num_bits - 1; i >= 0; --i) + assert(num_bits <= 32); + for (int i = num_bits - 1; i >= 0; --i) bb_put_bit(bbp, data >> i, elastic, cur_coded_list, ph_bytes); -// while (num_bits) { -// int tx_bits = num_bits < bbp->avail_bits ? num_bits : bbp->avail_bits; -// bbp->tmp |= (data >> (num_bits - tx_bits)) & ((1 << tx_bits) - 1); -// bbp->avail_bits -= tx_bits; -// if (bbp->avail_bits <= 0) -// { -// bbp->avail_bits = 8 - (bbp->tmp != 0xFF ? 0 : 1); -// bbp->buf[bbp->buf_size - bbp->avail_size] = (ui8)(bbp->tmp & 0xFF); -// bbp->tmp = 0; -// --bbp->avail_size; -// if (bbp->avail_size == 0) -// { -// bb_expand_buf(bbp, elastic, cur_coded_list->next_list); -// cur_coded_list = cur_coded_list->next_list; -// ph_bytes += bit_buffer::needed; -// } -// } -// } } ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp index 9a63ca1..351284b 100644 --- a/src/core/codestream/ojph_codeblock.cpp +++ b/src/core/codestream/ojph_codeblock.cpp @@ -45,6 +45,7 @@ #include "ojph_codestream_local.h" #include "ojph_codeblock.h" #include "ojph_subband.h" +#include "ojph_resolution.h" namespace ojph { @@ -52,7 +53,7 @@ { ////////////////////////////////////////////////////////////////////////// - void codeblock::pre_alloc(codestream *codestream, + void codeblock::pre_alloc(codestream *codestream, ui32 comp_num, const size& nominal) { mem_fixed_allocator* allocator = codestream->get_allocator(); @@ -60,7 +61,14 @@ assert(byte_alignment / sizeof(ui32) > 1); const ui32 f = byte_alignment / sizeof(ui32) - 1; ui32 stride = (nominal.w + f) & ~f; // a multiple of 8 - allocator->pre_alloc_data<ui32>(nominal.h * stride, 0); + + const param_siz* sz = codestream->get_siz(); + const param_cod* cd = codestream->get_cod(comp_num); + ui32 precision = cd->propose_implementation_precision(sz); + if (precision <= 32) + allocator->pre_alloc_data<ui32>(nominal.h * (size_t)stride, 0); + else + allocator->pre_alloc_data<ui64>(nominal.h * (size_t)stride, 0); } ////////////////////////////////////////////////////////////////////////// @@ -75,7 +83,19 @@ const ui32 f = byte_alignment / sizeof(ui32) - 1; this->stride = (nominal.w + f) & ~f; // a multiple of 8 this->buf_size = this->stride * nominal.h; - this->buf = allocator->post_alloc_data<ui32>(this->buf_size, 0); + + ui32 comp_num = parent->get_parent()->get_comp_num(); + const param_siz* sz = codestream->get_siz(); + const param_cod* cd = codestream->get_cod(comp_num); + ui32 bit_depth = cd->propose_implementation_precision(sz); + if (bit_depth <= 32) { + precision = BUF32; + this->buf32 = allocator->post_alloc_data<ui32>(this->buf_size, 0); + } + else { + precision = BUF64; + this->buf64 = allocator->post_alloc_data<ui64>(this->buf_size, 0); + } this->nominal_size = nominal; this->cb_size = cb_size; @@ -85,8 +105,8 @@ this->delta = parent->get_delta(); this->delta_inv = 1.0f / this->delta; this->K_max = K_max; - for (int i = 0; i < 8; ++i) - this->max_val[i] = 0; + for (int i = 0; i < 4; ++i) + this->max_val64[i] = 0; ojph::param_cod cod = codestream->access_cod(); this->reversible = cod.is_reversible(); this->resilient = codestream->is_resilient(); @@ -100,28 +120,61 @@ ////////////////////////////////////////////////////////////////////////// void 
codeblock::push(line_buf *line) { - // convert to sign and magnitude and keep max_val - const si32 *sp = line->i32 + line_offset; - ui32 *dp = buf + cur_line * stride; - this->codeblock_functions.tx_to_cb(sp, dp, K_max, delta_inv, cb_size.w, - max_val); - ++cur_line; + // convert to sign and magnitude and keep max_val + if (precision == BUF32) + { + assert(line->flags & line_buf::LFT_32BIT); + const si32 *sp = line->i32 + line_offset; + ui32 *dp = buf32 + cur_line * stride; + this->codeblock_functions.tx_to_cb32(sp, dp, K_max, delta_inv, + cb_size.w, max_val32); + ++cur_line; + } + else + { + assert(precision == BUF64); + assert(line->flags & line_buf::LFT_64BIT); + const si64 *sp = line->i64 + line_offset; + ui64 *dp = buf64 + cur_line * stride; + this->codeblock_functions.tx_to_cb64(sp, dp, K_max, delta_inv, + cb_size.w, max_val64); + ++cur_line; + } } ////////////////////////////////////////////////////////////////////////// void codeblock::encode(mem_elastic_allocator *elastic) { - ui32 mv = this->codeblock_functions.find_max_val(max_val); - if (mv >= 1u<<(31 - K_max)) + if (precision == BUF32) { - coded_cb->missing_msbs = K_max - 1; - assert(coded_cb->missing_msbs > 0); - assert(coded_cb->missing_msbs < K_max); - coded_cb->num_passes = 1; - - this->codeblock_functions.encode_cb(buf, K_max-1, 1, - cb_size.w, cb_size.h, stride, coded_cb->pass_length, - elastic, coded_cb->next_coded); + ui32 mv = this->codeblock_functions.find_max_val32(max_val32); + if (mv >= 1u << (31 - K_max)) + { + coded_cb->missing_msbs = K_max - 1; + assert(coded_cb->missing_msbs > 0); + assert(coded_cb->missing_msbs < K_max); + coded_cb->num_passes = 1; + + this->codeblock_functions.encode_cb32(buf32, K_max-1, 1, + cb_size.w, cb_size.h, stride, coded_cb->pass_length, + elastic, coded_cb->next_coded); + } + } + else + { + assert(precision == BUF64); + ui64 mv = this->codeblock_functions.find_max_val64(max_val64); + if (mv >= 1ULL << (63 - K_max)) + { + coded_cb->missing_msbs = K_max - 1; + assert(coded_cb->missing_msbs > 0); + assert(coded_cb->missing_msbs < K_max); + coded_cb->num_passes = 1; + + this->codeblock_functions.encode_cb64(buf64, K_max-1, 1, + cb_size.w, cb_size.h, stride, coded_cb->pass_length, + elastic, coded_cb->next_coded); + } } } @@ -132,8 +185,8 @@ namespace ojph { this->cb_size = cb_size; this->coded_cb = coded_cb; this->cur_line = 0; - for (int i = 0; i < 8; ++i) - this->max_val[i] = 0; + for (int i = 0; i < 4; ++i) + this->max_val64[i] = 0; this->zero_block = false; } @@ -143,20 +196,33 @@ namespace ojph { if (coded_cb->pass_length[0] > 0 && coded_cb->num_passes > 0 && coded_cb->next_coded != NULL) { - bool result = this->codeblock_functions.decode_cb( + bool result; + if (precision == BUF32) + { + result = this->codeblock_functions.decode_cb32( + coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size, + buf32, coded_cb->missing_msbs, coded_cb->num_passes, + coded_cb->pass_length[0], coded_cb->pass_length[1], + cb_size.w, cb_size.h, stride, stripe_causal); + } + else + { + assert(precision == BUF64); + result = this->codeblock_functions.decode_cb64( coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size, - buf, coded_cb->missing_msbs, coded_cb->num_passes, + buf64, coded_cb->missing_msbs, coded_cb->num_passes, coded_cb->pass_length[0], coded_cb->pass_length[1], cb_size.w, cb_size.h, stride, stripe_causal); + } if (result == false) { if (resilient == true) { - OJPH_INFO(0x000300A1, "Error decoding a codeblock"); + OJPH_INFO(0x000300A1, "Error decoding a codeblock."); zero_block = 
true; } else - OJPH_ERROR(0x000300A1, "Error decoding a codeblock"); + OJPH_ERROR(0x000300A1, "Error decoding a codeblock."); } } else @@ -167,15 +233,35 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void codeblock::pull_line(line_buf *line) { - si32 *dp = line->i32 + line_offset; - if (!zero_block) + //convert to sign and magnitude + if (precision == BUF32) { - //convert to sign and magnitude - const ui32 *sp = buf + cur_line * stride; - this->codeblock_functions.tx_from_cb(sp, dp, K_max, delta, cb_size.w); + assert(line->flags & line_buf::LFT_32BIT); + si32 *dp = line->i32 + line_offset; + if (!zero_block) + { + const ui32 *sp = buf32 + cur_line * stride; + this->codeblock_functions.tx_from_cb32(sp, dp, K_max, delta, + cb_size.w); + } + else + this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(ui32)); } else - this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp)); + { + assert(precision == BUF64); + assert(line->flags & line_buf::LFT_64BIT); + si64 *dp = line->i64 + line_offset; + if (!zero_block) + { + const ui64 *sp = buf64 + cur_line * stride; + this->codeblock_functions.tx_from_cb64(sp, dp, K_max, delta, + cb_size.w); + } + else + this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp)); + } + ++cur_line; assert(cur_line <= cb_size.h); } diff --git a/src/core/codestream/ojph_codeblock.h b/src/core/codestream/ojph_codeblock.h index 2f7d8e7..fde8e6a 100644 --- a/src/core/codestream/ojph_codeblock.h +++ b/src/core/codestream/ojph_codeblock.h @@ -48,7 +48,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class mem_elastic_allocator; class codestream; struct coded_lists; @@ -65,8 +65,14 @@ namespace ojph { class codeblock { friend struct precinct; + enum : ui32 { + BUF32 = 4, + BUF64 = 8, + }; + public: - static void pre_alloc(codestream *codestream, const size& nominal); + static void pre_alloc(codestream *codestream, ui32 comp_num, + const size& nominal); void finalize_alloc(codestream *codestream, subband* parent, const size& nominal, const size& cb_size, coded_cb_header* coded_cb, @@ -79,7 +85,11 @@ namespace ojph { void pull_line(line_buf *line); private: - ui32* buf; + ui32 precision; + union { + ui32* buf32; + ui64* buf64; + }; size nominal_size; size cb_size; ui32 stride; @@ -93,7 +103,10 @@ namespace ojph { bool resilient; bool stripe_causal; bool zero_block; // true when the decoded block is all zero - ui32 max_val[8]; // supports up to 256 bits + union { + ui32 max_val32[8]; // supports up to 256 bits + ui64 max_val64[4]; // supports up to 256 bits + }; coded_cb_header* coded_cb; codeblock_fun codeblock_functions; }; diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index cf51530..08d8d73 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -63,72 +63,107 @@ namespace ojph { void wasm_mem_clear(void* addr, size_t count); ////////////////////////////////////////////////////////////////////////// - ui32 gen_find_max_val(ui32* address); - ui32 sse2_find_max_val(ui32* address); - ui32 avx2_find_max_val(ui32* address); - ui32 wasm_find_max_val(ui32* address); + ui32 gen_find_max_val32(ui32* address); + ui32 sse2_find_max_val32(ui32* address); + ui32 avx2_find_max_val32(ui32* address); + ui32 wasm_find_max_val32(ui32* address); + ui64 gen_find_max_val64(ui64* address); + ui64 sse2_find_max_val64(ui64* 
address); + ui64 avx2_find_max_val64(ui64* address); + ui64 wasm_find_max_val64(ui64* address); + ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); + void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + + void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); + void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 
K_max, + float delta, ui32 count); + void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); void codeblock_fun::init(bool reversible) { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) // Default path, no acceleration. We may change this later - decode_cb = ojph_decode_codeblock; - find_max_val = gen_find_max_val; + decode_cb32 = ojph_decode_codeblock32; + find_max_val32 = gen_find_max_val32; mem_clear = gen_mem_clear; if (reversible) { - tx_to_cb = gen_rev_tx_to_cb; - tx_from_cb = gen_rev_tx_from_cb; + tx_to_cb32 = gen_rev_tx_to_cb32; + tx_from_cb32 = gen_rev_tx_from_cb32; } else { - tx_to_cb = gen_irv_tx_to_cb; - tx_from_cb = gen_irv_tx_from_cb; + tx_to_cb32 = gen_irv_tx_to_cb32; + tx_from_cb32 = gen_irv_tx_from_cb32; } - encode_cb = ojph_encode_codeblock; + encode_cb32 = ojph_encode_codeblock32; + + decode_cb64 = ojph_decode_codeblock64; + find_max_val64 = gen_find_max_val64; + if (reversible) { + tx_to_cb64 = gen_rev_tx_to_cb64; + tx_from_cb64 = gen_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; + } + encode_cb64 = ojph_encode_codeblock64; #ifndef OJPH_DISABLE_SIMD #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) - // Accelerated functions for INTEL/AMD CPUs + // Accelerated functions for INTEL/AMD CPUs #ifndef OJPH_DISABLE_SSE if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE) mem_clear = sse_mem_clear; @@ -136,21 +171,31 @@ namespace ojph { #ifndef OJPH_DISABLE_SSE2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) { - find_max_val = sse2_find_max_val; + find_max_val32 = sse2_find_max_val32; if (reversible) { - tx_to_cb = sse2_rev_tx_to_cb; - tx_from_cb = sse2_rev_tx_from_cb; + tx_to_cb32 = sse2_rev_tx_to_cb32; + tx_from_cb32 = sse2_rev_tx_from_cb32; } else { - tx_to_cb = sse2_irv_tx_to_cb; - tx_from_cb = sse2_irv_tx_from_cb; + tx_to_cb32 = sse2_irv_tx_to_cb32; + tx_from_cb32 = sse2_irv_tx_from_cb32; + } + find_max_val64 = sse2_find_max_val64; + if (reversible) { + tx_to_cb64 = sse2_rev_tx_to_cb64; + tx_from_cb64 = sse2_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; } } #endif // !OJPH_DISABLE_SSE2 #ifndef OJPH_DISABLE_SSSE3 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSSE3) - decode_cb = ojph_decode_codeblock_ssse3; + decode_cb32 = ojph_decode_codeblock_ssse3; #endif // !OJPH_DISABLE_SSSE3 #ifndef OJPH_DISABLE_AVX @@ -160,21 +205,39 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - find_max_val = avx2_find_max_val; + decode_cb32 = ojph_decode_codeblock_avx2; + find_max_val32 = avx2_find_max_val32; if (reversible) { - tx_to_cb = avx2_rev_tx_to_cb; - tx_from_cb = avx2_rev_tx_from_cb; + tx_to_cb32 = avx2_rev_tx_to_cb32; + tx_from_cb32 = avx2_rev_tx_from_cb32; } else { - tx_to_cb = avx2_irv_tx_to_cb; - tx_from_cb = avx2_irv_tx_from_cb; + tx_to_cb32 = 
avx2_irv_tx_to_cb32; + tx_from_cb32 = avx2_irv_tx_from_cb32; + } + encode_cb32 = ojph_encode_codeblock_avx2; + bool result = initialize_block_encoder_tables_avx2(); + assert(result); ojph_unused(result); + + find_max_val64 = avx2_find_max_val64; + if (reversible) { + tx_to_cb64 = avx2_rev_tx_to_cb64; + tx_from_cb64 = avx2_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; } } #endif // !OJPH_DISABLE_AVX2 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) - encode_cb = ojph_encode_codeblock_avx512; + if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { + encode_cb32 = ojph_encode_codeblock_avx512; + bool result = initialize_block_encoder_tables_avx512(); + assert(result); ojph_unused(result); + } #endif // !OJPH_DISABLE_AVX512 #elif defined(OJPH_ARCH_ARM) @@ -186,18 +249,31 @@ namespace ojph { #else // OJPH_ENABLE_WASM_SIMD // Accelerated functions for WASM SIMD. - decode_cb = ojph_decode_codeblock_wasm; - find_max_val = wasm_find_max_val; + decode_cb32 = ojph_decode_codeblock_wasm; + find_max_val32 = wasm_find_max_val32; mem_clear = wasm_mem_clear; if (reversible) { - tx_to_cb = wasm_rev_tx_to_cb; - tx_from_cb = wasm_rev_tx_from_cb; + tx_to_cb32 = wasm_rev_tx_to_cb32; + tx_from_cb32 = wasm_rev_tx_from_cb32; } else { - tx_to_cb = wasm_irv_tx_to_cb; - tx_from_cb = wasm_irv_tx_from_cb; + tx_to_cb32 = wasm_irv_tx_to_cb32; + tx_from_cb32 = wasm_irv_tx_from_cb32; + } + encode_cb32 = ojph_encode_codeblock32; + + decode_cb64 = ojph_decode_codeblock64; + find_max_val64 = wasm_find_max_val64; + if (reversible) { + tx_to_cb64 = wasm_rev_tx_to_cb64; + tx_from_cb64 = wasm_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; } - encode_cb = ojph_encode_codeblock; + encode_cb64 = ojph_encode_codeblock64; #endif // !OJPH_ENABLE_WASM_SIMD diff --git a/src/core/codestream/ojph_codeblock_fun.h b/src/core/codestream/ojph_codeblock_fun.h index 679b2d3..67fbc2b 100644 --- a/src/core/codestream/ojph_codeblock_fun.h +++ b/src/core/codestream/ojph_codeblock_fun.h @@ -51,23 +51,40 @@ namespace ojph { typedef void (*mem_clear_fun)(void* addr, size_t count); // define function signature for max value finding - typedef ui32 (*find_max_val_fun)(ui32* addr); + typedef ui32 (*find_max_val_fun32)(ui32* addr); + + typedef ui64 (*find_max_val_fun64)(ui64* addr); // define line transfer function signature from subbands to codeblocks - typedef void (*tx_to_cb_fun)(const void *sp, ui32 *dp, ui32 K_max, + typedef void (*tx_to_cb_fun32)(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32* max_val); + typedef void (*tx_to_cb_fun64)(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + // define line transfer function signature from codeblock to subband - typedef void (*tx_from_cb_fun)(const ui32 *sp, void *dp, ui32 K_max, + typedef void (*tx_from_cb_fun32)(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + + typedef void (*tx_from_cb_fun64)(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count); // define the block decoder function signature - typedef bool (*cb_decoder_fun)(ui8* coded_data, ui32* decoded_data, + typedef bool (*cb_decoder_fun32)(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + + typedef bool (*cb_decoder_fun64)(ui8* coded_data, ui64* decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 
lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal);
 
     // define the block encoder function signature
-    typedef void (*cb_encoder_fun)(ui32* buf, ui32 missing_msbs,
+    typedef void (*cb_encoder_fun32)(ui32* buf, ui32 missing_msbs,
+      ui32 num_passes, ui32 width, ui32 height, ui32 stride,
+      ui32* lengths, ojph::mem_elastic_allocator* elastic,
+      ojph::coded_lists*& coded);
+
+    typedef void (*cb_encoder_fun64)(ui64* buf, ui32 missing_msbs,
       ui32 num_passes, ui32 width, ui32 height, ui32 stride,
       ui32* lengths, ojph::mem_elastic_allocator* elastic,
       ojph::coded_lists*& coded);
@@ -81,19 +98,24 @@
      mem_clear_fun mem_clear;
 
      // a pointer to the max value finding function
-      find_max_val_fun find_max_val;
+      find_max_val_fun32 find_max_val32;
+      find_max_val_fun64 find_max_val64;
 
      // a pointer to function transferring samples from subbands to codeblocks
-      tx_to_cb_fun tx_to_cb;
+      tx_to_cb_fun32 tx_to_cb32;
+      tx_to_cb_fun64 tx_to_cb64;
 
      // a pointer to function transferring samples from codeblocks to subbands
-      tx_from_cb_fun tx_from_cb;
+      tx_from_cb_fun32 tx_from_cb32;
+      tx_from_cb_fun64 tx_from_cb64;
 
      // a pointer to the decoder function
-      cb_decoder_fun decode_cb;
+      cb_decoder_fun32 decode_cb32;
+      cb_decoder_fun64 decode_cb64;
 
      // a pointer to the encoder function
-      cb_encoder_fun encode_cb;
+      cb_encoder_fun32 encode_cb32;
+      cb_encoder_fun64 encode_cb64;
    };
  }
diff --git a/src/core/codestream/ojph_codestream.cpp b/src/core/codestream/ojph_codestream.cpp
index 06f6b56..f2832ac 100644
--- a/src/core/codestream/ojph_codestream.cpp
+++ b/src/core/codestream/ojph_codestream.cpp
@@ -84,6 +84,12 @@
      return param_qcd(&state->qcd);
    }
 
+    ////////////////////////////////////////////////////////////////////////////
+    param_nlt codestream::access_nlt()
+    {
+      return param_nlt(&state->nlt);
+    }
+
    ////////////////////////////////////////////////////////////////////////////
    void codestream::set_planar(bool planar)
    {
diff --git a/src/core/codestream/ojph_codestream_avx2.cpp b/src/core/codestream/ojph_codestream_avx2.cpp
index 04a81ed..a8e5138 100644
--- a/src/core/codestream/ojph_codestream_avx2.cpp
+++ b/src/core/codestream/ojph_codestream_avx2.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include <climits>
 #include <immintrin.h>
 
 #include "ojph_defs.h"
@@ -42,7 +43,7 @@ namespace ojph {
  namespace local {

    //////////////////////////////////////////////////////////////////////////
-    ui32 avx2_find_max_val(ui32* address)
+    ui32 avx2_find_max_val32(ui32* address)
    {
      __m128i x0 = _mm_loadu_si128((__m128i*)address);
      __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
@@ -56,14 +57,26 @@
    }

    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 avx2_find_max_val64(ui64* address)
+    {
+      __m128i x0 = _mm_loadu_si128((__m128i*)address);
+      __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
+      x0 = _mm_or_si128(x0, x1);
+      x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
+      x0 = _mm_or_si128(x0, x1);
+      ui64 t = (ui64)_mm_extract_epi64(x0, 0);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
    {
      ojph_unused(delta_inv);

      // convert to sign and magnitude and keep max_val
      ui32 shift = 31 - K_max;
-      __m256i m0 =
_mm256_set1_epi32((int)0x80000000); + __m256i m0 = _mm256_set1_epi32(INT_MIN); __m256i tmax = _mm256_loadu_si256((__m256i*)max_val); __m256i *p = (__m256i*)sp; for (ui32 i = 0; i < count; i += 8, p += 1, dp += 8) @@ -78,16 +91,16 @@ namespace ojph { } _mm256_storeu_si256((__m256i*)max_val, tmax); } - + ////////////////////////////////////////////////////////////////////////// - void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(K_max); //quantize and convert to sign and magnitude and keep max_val __m256 d = _mm256_set1_ps(delta_inv); - __m256i m0 = _mm256_set1_epi32((int)0x80000000); + __m256i m0 = _mm256_set1_epi32(INT_MIN); __m256i tmax = _mm256_loadu_si256((__m256i*)max_val); float *p = (float*)sp; @@ -106,29 +119,29 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; - __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF); + __m256i m1 = _mm256_set1_epi32(INT_MAX); si32 *p = (si32*)dp; for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8) { - __m256i v = _mm256_load_si256((__m256i*)sp); - __m256i val = _mm256_and_si256(v, m1); - val = _mm256_srli_epi32(val, (int)shift); - val = _mm256_sign_epi32(val, v); - _mm256_storeu_si256((__m256i*)p, val); + __m256i v = _mm256_load_si256((__m256i*)sp); + __m256i val = _mm256_and_si256(v, m1); + val = _mm256_srli_epi32(val, (int)shift); + val = _mm256_sign_epi32(val, v); + _mm256_storeu_si256((__m256i*)p, val); } } ////////////////////////////////////////////////////////////////////////// - void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); - __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF); + __m256i m1 = _mm256_set1_epi32(INT_MAX); __m256 d = _mm256_set1_ps(delta); float *p = (float*)dp; for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8) @@ -142,5 +155,58 @@ namespace ojph { _mm256_storeu_ps(p, valf); } } + + ////////////////////////////////////////////////////////////////////////// + void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 63 - K_max; + __m256i m0 = _mm256_set1_epi64x(LLONG_MIN); + __m256i zero = _mm256_setzero_si256(); + __m256i one = _mm256_set1_epi64x(1); + __m256i tmax = _mm256_loadu_si256((__m256i*)max_val); + __m256i *p = (__m256i*)sp; + for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4) + { + __m256i v = _mm256_loadu_si256(p); + __m256i sign = _mm256_cmpgt_epi64(zero, v); + __m256i val = _mm256_xor_si256(v, sign); // negate 1's complement + __m256i ones = _mm256_and_si256(sign, one); + val = _mm256_add_epi64(val, ones); // 2's complement + sign = _mm256_and_si256(sign, m0); + val = _mm256_slli_epi64(val, (int)shift); + tmax = _mm256_or_si256(tmax, val); + val = _mm256_or_si256(val, sign); + _mm256_storeu_si256((__m256i*)dp, val); + } + _mm256_storeu_si256((__m256i*)max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void 
avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + + ui32 shift = 63 - K_max; + __m256i m1 = _mm256_set1_epi64x(LLONG_MAX); + __m256i zero = _mm256_setzero_si256(); + __m256i one = _mm256_set1_epi64x(1); + si64 *p = (si64*)dp; + for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) + { + __m256i v = _mm256_load_si256((__m256i*)sp); + __m256i val = _mm256_and_si256(v, m1); + val = _mm256_srli_epi64(val, (int)shift); + __m256i sign = _mm256_cmpgt_epi64(zero, v); + val = _mm256_xor_si256(val, sign); // negate 1's complement + __m256i ones = _mm256_and_si256(sign, one); + val = _mm256_add_epi64(val, ones); // 2's complement + _mm256_storeu_si256((__m256i*)p, val); + } + } } -} \ No newline at end of file +} diff --git a/src/core/codestream/ojph_codestream_gen.cpp b/src/core/codestream/ojph_codestream_gen.cpp index 466f483..cdc72c6 100644 --- a/src/core/codestream/ojph_codestream_gen.cpp +++ b/src/core/codestream/ojph_codestream_gen.cpp @@ -44,18 +44,21 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void gen_mem_clear(void* addr, size_t count) { - ui32* p = (ui32*)addr; - for (size_t i = 0; i < count; i += 4, p += 1) - *p = 0; + si64* p = (si64*)addr; + for (size_t i = 0; i < count; i += 8) + *p++ = 0; } ////////////////////////////////////////////////////////////////////////// - ui32 gen_find_max_val(ui32* addr) { return addr[0]; } + ui32 gen_find_max_val32(ui32* addr) { return addr[0]; } ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, - ui32* max_val) + ui64 gen_find_max_val64(ui64* addr) { return addr[0]; } + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, + ui32* max_val) { ojph_unused(delta_inv); ui32 shift = 31 - K_max; @@ -65,7 +68,7 @@ namespace ojph { for (ui32 i = count; i > 0; --i) { si32 v = *p++; - ui32 sign = v >= 0 ? 0 : 0x80000000; + ui32 sign = v >= 0 ? 0U : 0x80000000U; ui32 val = (ui32)(v >= 0 ? v : -v); val <<= shift; *dp++ = sign | val; @@ -75,9 +78,31 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, - ui32* max_val) + void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, + ui64* max_val) + { + ojph_unused(delta_inv); + ui32 shift = 63 - K_max; + // convert to sign and magnitude and keep max_val + ui64 tmax = *max_val; + si64 *p = (si64*)sp; + for (ui32 i = count; i > 0; --i) + { + si64 v = *p++; + ui64 sign = v >= 0 ? 0ULL : 0x8000000000000000ULL; + ui64 val = (ui64)(v >= 0 ? v : -v); + val <<= shift; + *dp++ = sign | val; + tmax |= val; // it is more efficient to use or than max + } + *max_val = tmax; + } + + ////////////////////////////////////////////////////////////////////////// + void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, + ui32* max_val) { ojph_unused(K_max); //quantize and convert to sign and magnitude and keep max_val @@ -87,7 +112,7 @@ namespace ojph { { float v = *p++; si32 t = ojph_trunc(v * delta_inv); - ui32 sign = t >= 0 ? 0 : 0x80000000; + ui32 sign = t >= 0 ? 0U : 0x80000000U; ui32 val = (ui32)(t >= 0 ? 
t : -t); *dp++ = sign | val; tmax |= val; // it is more efficient to use or than max @@ -96,8 +121,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; @@ -106,14 +131,30 @@ namespace ojph { for (ui32 i = count; i > 0; --i) { ui32 v = *sp++; - si32 val = (v & 0x7FFFFFFF) >> shift; - *p++ = (v & 0x80000000) ? -val : val; + si32 val = (v & 0x7FFFFFFFU) >> shift; + *p++ = (v & 0x80000000U) ? -val : val; + } + } + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 63 - K_max; + //convert to sign and magnitude + si64 *p = (si64*)dp; + for (ui32 i = count; i > 0; --i) + { + ui64 v = *sp++; + si64 val = (v & 0x7FFFFFFFFFFFFFFFULL) >> shift; + *p++ = (v & 0x8000000000000000ULL) ? -val : val; } } ////////////////////////////////////////////////////////////////////////// - void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); //convert to sign and magnitude @@ -121,8 +162,8 @@ namespace ojph { for (ui32 i = count; i > 0; --i) { ui32 v = *sp++; - float val = (float)(v & 0x7FFFFFFF) * delta; - *p++ = (v & 0x80000000) ? -val : val; + float val = (float)(v & 0x7FFFFFFFU) * delta; + *p++ = (v & 0x80000000U) ? -val : val; } } diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp index 8279466..7a114b7 100644 --- a/src/core/codestream/ojph_codestream_local.cpp +++ b/src/core/codestream/ojph_codestream_local.cpp @@ -550,6 +550,7 @@ namespace ojph { cod.update_atk(atk); qcd.check_validity(siz, cod); cap.check_validity(cod, qcd); + nlt.check_validity(siz); if (profile == OJPH_PN_IMF) check_imf_validity(); else if (profile == OJPH_PN_BROADCAST) @@ -632,6 +633,9 @@ namespace ojph { if (!qcd.write(file)) OJPH_ERROR(0x00030026, "Error writing to file"); + if (!nlt.write(file)) + OJPH_ERROR(0x00030027, "Error writing to file"); + char buf[] = " OpenJPH Ver " OJPH_INT_TO_STRING(OPENJPH_VERSION_MAJOR) "." OJPH_INT_TO_STRING(OPENJPH_VERSION_MINOR) "." 
@@ -642,23 +646,23 @@ namespace ojph { //1 for General use (IS 8859-15:1999 (Latin) values) *(ui16*)(buf + 4) = swap_byte((ui16)(1)); if (file->write(buf, len) != len) - OJPH_ERROR(0x00030027, "Error writing to file"); + OJPH_ERROR(0x00030028, "Error writing to file"); if (comments != NULL) { for (ui32 i = 0; i < num_comments; ++i) { t = swap_byte(JP2K_MARKER::COM); if (file->write(&t, 2) != 2) - OJPH_ERROR(0x00030028, "Error writing to file"); + OJPH_ERROR(0x00030029, "Error writing to file"); t = swap_byte((ui16)(comments[i].len + 4)); if (file->write(&t, 2) != 2) - OJPH_ERROR(0x00030029, "Error writing to file"); + OJPH_ERROR(0x0003002A, "Error writing to file"); //1 for General use (IS 8859-15:1999 (Latin) values) t = swap_byte(comments[i].Rcom); if (file->write(&t, 2) != 2) - OJPH_ERROR(0x0003002A, "Error writing to file"); - if (file->write(comments[i].data, comments[i].len)!=comments[i].len) OJPH_ERROR(0x0003002B, "Error writing to file"); + if (file->write(comments[i].data, comments[i].len)!=comments[i].len) + OJPH_ERROR(0x0003002C, "Error writing to file"); } } } @@ -728,8 +732,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void codestream::read_headers(infile_base *file) { - ui16 marker_list[19] = { SOC, SIZ, CAP, PRF, CPF, COD, COC, QCD, QCC, - RGN, POC, PPM, TLM, PLM, CRG, COM, DFS, ATK, SOT }; + ui16 marker_list[20] = { SOC, SIZ, CAP, PRF, CPF, COD, COC, QCD, QCC, + RGN, POC, PPM, TLM, PLM, CRG, COM, DFS, ATK, NLT, SOT }; find_marker(file, marker_list, 1); //find SOC find_marker(file, marker_list + 1, 1); //find SIZ siz.read(file); @@ -737,7 +741,7 @@ namespace ojph { int received_markers = 0; //check that COD, & QCD received while (true) { - marker_idx = find_marker(file, marker_list + 2, 17); + marker_idx = find_marker(file, marker_list + 2, 18); if (marker_idx == 0) cap.read(file); else if (marker_idx == 1) @@ -813,6 +817,8 @@ namespace ojph { else if (marker_idx == 15) atk[2].read(file); else if (marker_idx == 16) + nlt.read(file); + else if (marker_idx == 17) break; else OJPH_ERROR(0x00030051, "File ended before finding a tile segment"); @@ -902,19 +908,20 @@ namespace ojph { } bool sod_found = false; - ui16 other_tile_part_markers[6] = { SOT, POC, PPT, PLT, COM, SOD }; + ui16 other_tile_part_markers[7] = { SOT, POC, PPT, PLT, COM, + NLT, SOD }; while (true) { int marker_idx = 0; int result = 0; - marker_idx = find_marker(infile, other_tile_part_markers + 1, 5); + marker_idx = find_marker(infile, other_tile_part_markers + 1, 6); if (marker_idx == 0) result = skip_marker(infile, "POC", - "POC in a tile is not supported yet", + "POC marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 1) result = skip_marker(infile, "PPT", - "PPT in a tile is not supported yet", + "PPT marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 2) //Skipping PLT marker segment;this should not cause any issues @@ -924,6 +931,10 @@ namespace ojph { result = skip_marker(infile, "COM", NULL, OJPH_MSG_LEVEL::NO_MSG, resilient); else if (marker_idx == 4) + result = skip_marker(infile, "NLT", + "NLT marker in tile is not supported yet", + OJPH_MSG_LEVEL::WARN, resilient); + else if (marker_idx == 5) { sod_found = true; break; @@ -961,40 +972,40 @@ namespace ojph { else { //first tile part bool sod_found = false; - ui16 first_tile_part_markers[11] = { SOT, COD, COC, QCD, QCC, RGN, - POC, PPT, PLT, COM, SOD }; + ui16 first_tile_part_markers[12] = { SOT, 
COD, COC, QCD, QCC, RGN,
+          POC, PPT, PLT, COM, NLT, SOD };
        while (true)
        {
          int marker_idx = 0;
          int result = 0;
-          marker_idx = find_marker(infile, first_tile_part_markers+1, 10);
+          marker_idx = find_marker(infile, first_tile_part_markers+1, 11);
          if (marker_idx == 0)
            result = skip_marker(infile, "COD",
-              "COD in a tile is not supported yet",
+              "COD marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 1)
            result = skip_marker(infile, "COC",
-              "COC in a tile is not supported yet",
+              "COC marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 2)
            result = skip_marker(infile, "QCD",
-              "QCD in a tile is not supported yet",
+              "QCD marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 3)
            result = skip_marker(infile, "QCC",
-              "QCC in a tile is not supported yet",
+              "QCC marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 4)
            result = skip_marker(infile, "RGN",
-              "RGN in a tile is not supported yet",
+              "RGN marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 5)
            result = skip_marker(infile, "POC",
-              "POC in a tile is not supported yet",
+              "POC marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 6)
            result = skip_marker(infile, "PPT",
-              "PPT in a tile is not supported yet",
+              "PPT marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 7)
            //Skipping PLT marker segment; this should not cause any issues
@@ -1004,6 +1015,10 @@
            result = skip_marker(infile, "COM", NULL,
              OJPH_MSG_LEVEL::NO_MSG, resilient);
          else if (marker_idx == 9)
+            result = skip_marker(infile, "NLT",
+              "NLT marker segment in a tile is not supported yet",
+              OJPH_MSG_LEVEL::WARN, resilient);
+          else if (marker_idx == 10)
          {
            sod_found = true;
            break;
diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h
index 8ca8c71..3d03658 100644
--- a/src/core/codestream/ojph_codestream_local.h
+++ b/src/core/codestream/ojph_codestream_local.h
@@ -46,7 +46,7 @@ namespace ojph {
 
  ////////////////////////////////////////////////////////////////////////////
  //defined elsewhere
-  struct line_buf;
+  class line_buf;
  class mem_fixed_allocator;
  class mem_elastic_allocator;
  class codestream;
@@ -96,6 +96,8 @@
      }
      const param_dfs* access_dfs()
      { if (dfs.exists()) return &dfs; else return NULL; }
+      const param_nlt* get_nlt()
+      { return &nlt; }
      mem_fixed_allocator* get_allocator() { return allocator; }
      mem_elastic_allocator* get_elastic_alloc() { return elastic_alloc; }
      outfile_base* get_file() { return outfile; }
@@ -161,6 +163,7 @@
      param_cap cap;           // extended capabilities
      param_qcd qcd;           // quantization default
      param_tlm tlm;           // tile-part lengths
+      param_nlt nlt;           // non-linearity point transformation
 
    private: // this is to handle qcc and coc
      int used_qcc_fields;
diff --git a/src/core/codestream/ojph_codestream_sse.cpp b/src/core/codestream/ojph_codestream_sse.cpp
index 7c64ad9..6a31cbd 100644
--- a/src/core/codestream/ojph_codestream_sse.cpp
+++ b/src/core/codestream/ojph_codestream_sse.cpp
@@ -49,6 +49,5 @@ namespace ojph {
      for (size_t i = 0; i < count; i += 16, p += 4)
        _mm_storeu_ps(p, zero);
    }
-
  }
}
\ No newline at end of file
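// How the new NLT plumbing above fits together, as a hedged usage sketch
// built only from calls this patch introduces (codestream::access_nlt() in
// ojph_codestream.cpp and param_nlt::set_type3_transformation() in
// ojph_params.cpp):
//
//   ojph::codestream cs;
//   ojph::param_nlt nlt = cs.access_nlt();
//   nlt.set_type3_transformation(0, true); // type-3 NLT on component 0
//   // later, param_nlt::check_validity() fills BDnlt from the SIZ bit
//   // depth/signedness and raises RSIZ_EXT_FLAG | RSIZ_NLT_FLAG, so that
//   // param_nlt::write() emits one NLT marker segment per enabled entry.
//
diff --git a/src/core/codestream/ojph_codestream_sse2.cpp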
b/src/core/codestream/ojph_codestream_sse2.cpp
index 9bb0643..3352bcd 100644
--- a/src/core/codestream/ojph_codestream_sse2.cpp
+++ b/src/core/codestream/ojph_codestream_sse2.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include <climits>
 #include <emmintrin.h>
 
 #include "ojph_defs.h"
@@ -42,7 +43,7 @@ namespace ojph {
  namespace local {

    //////////////////////////////////////////////////////////////////////////
-    ui32 sse2_find_max_val(ui32* address)
+    ui32 sse2_find_max_val32(ui32* address)
    {
      __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
      x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
@@ -59,14 +60,29 @@
    }

    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 sse2_find_max_val64(ui64* address)
+    {
+      __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
+      x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
+      x0 = _mm_or_si128(x0, x1);
+      _mm_storeu_si128((__m128i*)address, x0);
+      return *address;
+      // A single movd t, xmm0 can do the trick, but it is not available
+      // in SSE2 intrinsics. extract_epi32 is available in sse4.1
+      // ui32 t = (ui32)_mm_extract_epi16(x0, 0);
+      // t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
+      // return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
    {
      ojph_unused(delta_inv);

      // convert to sign and magnitude and keep max_val
      ui32 shift = 31 - K_max;
-      __m128i m0 = _mm_set1_epi32((int)0x80000000);
+      __m128i m0 = _mm_set1_epi32(INT_MIN);
      __m128i zero = _mm_setzero_si128();
      __m128i one = _mm_set1_epi32(1);
      __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
@@ -88,8 +104,8 @@
    }

    //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
    {
      ojph_unused(K_max);

@@ -118,34 +134,34 @@
    }

    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
    {
      ojph_unused(delta);
      ui32 shift = 31 - K_max;
-      __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
+      __m128i m1 = _mm_set1_epi32(INT_MAX);
      __m128i zero = _mm_setzero_si128();
      __m128i one = _mm_set1_epi32(1);
      si32 *p = (si32*)dp;
      for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
      {
-        __m128i v = _mm_load_si128((__m128i*)sp);
-        __m128i val = _mm_and_si128(v, m1);
-        val = _mm_srli_epi32(val, (int)shift);
-        __m128i sign = _mm_cmplt_epi32(v, zero);
-        val = _mm_xor_si128(val, sign); // negate 1's complement
-        __m128i ones = _mm_and_si128(sign, one);
-        val = _mm_add_epi32(val, ones); // 2's complement
-        _mm_storeu_si128((__m128i*)p, val);
+        __m128i v = _mm_load_si128((__m128i*)sp);
+        __m128i val = _mm_and_si128(v, m1);
+        val = _mm_srli_epi32(val, (int)shift);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        val = _mm_xor_si128(val, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi32(val, ones); // 2's complement
+        _mm_storeu_si128((__m128i*)p, val);
      }
    }
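    // An illustrative scalar model of the branchless sign-magnitude decode
    // used above and in the 64-bit variants below; it relies on two's
    // complement negation being ~x + 1, so xor-ing with an all-ones mask and
    // adding one negates exactly the negative samples (hypothetical helper,
    // shown only to document the SIMD logic):
    static si32 sign_mag_decode_sketch(ui32 v, ui32 shift)
    {
      ui32 mag  = (v & 0x7FFFFFFFu) >> shift;    // strip sign, undo shift
      ui32 mask = (v & 0x80000000u) ? ~0u : 0u;  // all ones when negative
      return (si32)((mag ^ mask) + (mask & 1));  // conditional negation
    }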
    //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
    {
      ojph_unused(K_max);
-      __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
+      __m128i m1 = _mm_set1_epi32(INT_MAX);
      __m128 d = _mm_set1_ps(delta);
      float *p = (float*)dp;
      for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
@@ -159,5 +175,59 @@
        _mm_storeu_ps(p, valf);
      }
    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val)
+    {
+      ojph_unused(delta_inv);
+
+      // convert to sign and magnitude and keep max_val
+      ui32 shift = 63 - K_max;
+      __m128i m0 = _mm_set1_epi64x(LLONG_MIN);
+      __m128i zero = _mm_setzero_si128();
+      __m128i one = _mm_set1_epi64x(1);
+      __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
+      __m128i *p = (__m128i*)sp;
+      for (ui32 i = 0; i < count; i += 2, p += 1, dp += 2)
+      {
+        __m128i v = _mm_loadu_si128(p);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
+        __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi64(val, ones); // 2's complement
+        sign = _mm_and_si128(sign, m0);
+        val = _mm_slli_epi64(val, (int)shift);
+        tmax = _mm_or_si128(tmax, val);
+        val = _mm_or_si128(val, sign);
+        _mm_storeu_si128((__m128i*)dp, val);
+      }
+      _mm_storeu_si128((__m128i*)max_val, tmax);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
+    {
+      ojph_unused(delta);
+      ui32 shift = 63 - K_max;
+      __m128i m1 = _mm_set1_epi64x(LLONG_MAX);
+      __m128i zero = _mm_setzero_si128();
+      __m128i one = _mm_set1_epi64x(1);
+      si64 *p = (si64*)dp;
+      for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
+      {
+        __m128i v = _mm_load_si128((__m128i*)sp);
+        __m128i val = _mm_and_si128(v, m1);
+        val = _mm_srli_epi64(val, (int)shift);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
+        val = _mm_xor_si128(val, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi64(val, ones); // 2's complement
+        _mm_storeu_si128((__m128i*)p, val);
+      }
+    }
  }
}
\ No newline at end of file
diff --git a/src/core/codestream/ojph_codestream_wasm.cpp b/src/core/codestream/ojph_codestream_wasm.cpp
index 19e47aa..e2cd444 100644
--- a/src/core/codestream/ojph_codestream_wasm.cpp
+++ b/src/core/codestream/ojph_codestream_wasm.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include <climits>
 #include <cstdlib>
 #include <wasm_simd128.h>
 
@@ -43,20 +44,17 @@ namespace ojph {
  namespace local {

-    //////////////////////////////////////////////////////////////////////////
-    #define REPEAT(a) a,a,a,a
-
    //////////////////////////////////////////////////////////////////////////
    void wasm_mem_clear(void* addr, size_t count)
    {
      float* p = (float*)addr;
-      v128_t zero = wasm_i32x4_const(REPEAT(0));
+      v128_t zero = wasm_i32x4_splat(0);
      for (size_t i = 0; i < count; i += 16, p += 4)
        wasm_v128_store(p, zero);
    }
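    // A note on the OR-based reductions used here and in the SSE2/AVX2
    // paths: the encoder only needs the position of the highest set
    // magnitude bit (to decide how many MSBs can be skipped), and bitwise OR
    // preserves exactly that, more cheaply than a true maximum. A scalar
    // model (illustrative helper, not an existing function):
    static ui32 or_reduce_sketch(const ui32* vals, ui32 count)
    {
      ui32 acc = 0;
      for (ui32 i = 0; i < count; ++i)
        acc |= vals[i]; // same most-significant-bit position as the maximum
      return acc;
    }

    //////////////////////////////////////////////////////////////////////////
-    ui32 wasm_find_max_val(ui32* address)
+    ui32 wasm_find_max_val32(ui32* address)
    {
      v128_t x1, x0 =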
wasm_v128_load(address); x1 = wasm_i32x4_shuffle(x0, x0, 2, 3, 2, 3); // x1 = x0[2,3,2,3] @@ -68,19 +66,29 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + ui64 wasm_find_max_val64(ui64* address) + { + v128_t x1, x0 = wasm_v128_load(address); + x1 = wasm_i64x2_shuffle(x0, x0, 1, 1); // x1 = x0[2,3,2,3] + x0 = wasm_v128_or(x0, x1); + ui64 t = (ui64)wasm_i64x2_extract_lane(x0, 0); + return t; + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(delta_inv); // convert to sign and magnitude and keep max_val ui32 shift = 31 - K_max; - v128_t m0 = wasm_i32x4_const(REPEAT((int)0x80000000)); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t m0 = wasm_i32x4_splat(INT_MIN); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); v128_t tmax = wasm_v128_load(max_val); - v128_t *p = (v128_t*)sp; - for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4) + si32 *p = (si32*)sp; + for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4) { v128_t v = wasm_v128_load(p); v128_t sign = wasm_i32x4_lt(v, zero); @@ -97,16 +105,16 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(K_max); //quantize and convert to sign and magnitude and keep max_val v128_t d = wasm_f32x4_splat(delta_inv); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); v128_t tmax = wasm_v128_load(max_val); float *p = (float*)sp; for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4) @@ -127,14 +135,14 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; - v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF)); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t m1 = wasm_i32x4_splat(INT_MAX); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); si32 *p = (si32*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) { @@ -150,11 +158,11 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); - v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF)); + v128_t m1 = wasm_i32x4_splat(INT_MAX); v128_t d = wasm_f32x4_splat(delta); float *p = (float*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) @@ -167,6 +175,58 @@ namespace ojph { valf = wasm_v128_or(valf, sign); wasm_v128_store(p, valf); } - } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_to_cb64(const void *sp, 
ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 63 - K_max; + v128_t m0 = wasm_i64x2_splat(LLONG_MIN); + v128_t zero = wasm_i64x2_splat(0); + v128_t one = wasm_i64x2_splat(1); + v128_t tmax = wasm_v128_load(max_val); + si64 *p = (si64*)sp; + for (ui32 i = 0; i < count; i += 2, p += 2, dp += 2) + { + v128_t v = wasm_v128_load(p); + v128_t sign = wasm_i64x2_lt(v, zero); + v128_t val = wasm_v128_xor(v, sign); // negate 1's complement + v128_t ones = wasm_v128_and(sign, one); + val = wasm_i64x2_add(val, ones); // 2's complement + sign = wasm_v128_and(sign, m0); + val = wasm_i64x2_shl(val, shift); + tmax = wasm_v128_or(tmax, val); + val = wasm_v128_or(val, sign); + wasm_v128_store(dp, val); + } + wasm_v128_store(max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 63 - K_max; + v128_t m1 = wasm_i64x2_splat(LLONG_MAX); + v128_t zero = wasm_i64x2_splat(0); + v128_t one = wasm_i64x2_splat(1); + si64 *p = (si64*)dp; + for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2) + { + v128_t v = wasm_v128_load((v128_t*)sp); + v128_t val = wasm_v128_and(v, m1); + val = wasm_i64x2_shr(val, shift); + v128_t sign = wasm_i64x2_lt(v, zero); + val = wasm_v128_xor(val, sign); // negate 1's complement + v128_t ones = wasm_v128_and(sign, one); + val = wasm_i64x2_add(val, ones); // 2's complement + wasm_v128_store(p, val); + } + } } } \ No newline at end of file diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index b6ada17..8a234e5 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -372,6 +372,27 @@ namespace ojph { // //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + void param_nlt::set_type3_transformation(ui32 comp_num, bool enable) + { + state->set_type3_transformation(comp_num, enable); + } + + ////////////////////////////////////////////////////////////////////////// + bool param_nlt::get_type3_transformation(ui32 comp_num, ui8& bit_depth, + bool& is_signed) + { + return state->get_type3_transformation(comp_num, bit_depth, is_signed); + } + + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void comment_exchange::set_string(const char* str) { @@ -611,7 +632,7 @@ namespace ojph { if ((Rsiz & 0x4000) == 0) OJPH_ERROR(0x00050044, "Rsiz bit 14 is not set (this is not a JPH file)"); - if ((Rsiz & 0x8000) != 0 && (Rsiz & 0xF5F) != 0) + if ((Rsiz & 0x8000) != 0 && (Rsiz & 0xD5F) != 0) OJPH_WARN(0x00050001, "Rsiz in SIZ has unimplemented fields"); if (file->read(&Xsiz, 4) != 4) OJPH_ERROR(0x00050045, "error reading SIZ marker"); @@ -755,6 +776,25 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + ui32 + param_cod::propose_implementation_precision(const param_siz* siz) const + { + bool employing_color_transform = is_employing_color_transform() ? 
1 : 0; + bool reversible = atk->is_reversible(); + + ui32 bit_depth = 32; + if (reversible) { + bit_depth = siz->get_bit_depth(comp_num); + bit_depth += comp_num < 3 ? employing_color_transform : 0; + // 3 or 4 is how many extra bits are needed for the HH band at the + // bottom most level of decomposition. + bit_depth += get_num_decompositions() > 5 ? 4 : 3; + } + + return bit_depth; + } + ////////////////////////////////////////////////////////////////////////// bool param_cod::write(outfile_base *file) { @@ -908,24 +948,46 @@ namespace ojph { void param_qcd::set_rev_quant(ui32 num_decomps, ui32 bit_depth, bool is_employing_color_transform) { - int guard_bits = 1; - Sqcd = (ui8)(guard_bits << 5); //one guard bit, and no quantization ui32 B = bit_depth; B += is_employing_color_transform ? 1 : 0; //1 bit for RCT int s = 0; - float bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); - //we leave some leeway for numerical error by multiplying by 1.1f - ui32 X = (ui32) ceil(log(bibo_l * bibo_l * 1.1f) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + double bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); + ui32 X = (ui32) ceil(log(bibo_l * bibo_l) / M_LN2); + u8_SPqcd[s++] = (ui8)(B + X); + ui32 max_B_plus_X = (ui32)(B + X); for (ui32 d = num_decomps; d > 0; --d) { - float bibo_l = bibo_gains::get_bibo_gain_l(d, true); - float bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); - X = (ui32) ceil(log(bibo_h * bibo_l * 1.1f) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); - u8_SPqcd[s++] = (ui8)((B + X) << 3); - X = (ui32) ceil(log(bibo_h * bibo_h * 1.1f) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + double bibo_l = bibo_gains::get_bibo_gain_l(d, true); + double bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); + X = (ui32) ceil(log(bibo_h * bibo_l) / M_LN2); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + X = (ui32) ceil(log(bibo_h * bibo_h) / M_LN2); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + } + + if (max_B_plus_X > 38) + OJPH_ERROR(0x00050151, "The specified combination of bit_depth, " + "colour transform, and type of wavelet transform requires more than " + "38 bits; it requires %d bits. 
This is beyond what is allowed in " + "the JPEG2000 image coding format.", max_B_plus_X); + + int guard_bits = ojph_max(1, (si32)max_B_plus_X - 31); + Sqcd = (ui8)(guard_bits << 5); + s = 0; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + for (ui32 d = num_decomps; d > 0; --d) + { + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; } } @@ -981,8 +1043,11 @@ namespace ojph { ui32 B = 0; int irrev = Sqcd & 0x1F; if (irrev == 0) //reversible - for (ui32 i = 0; i < num_subbands; ++i) - B = ojph_max(B, (u8_SPqcd[i] >> 3) + get_num_guard_bits() - 1u); + for (ui32 i = 0; i < num_subbands; ++i) { + ui32 t = decode_SPqcd(u8_SPqcd[i]); + t += get_num_guard_bits() - 1u; + B = ojph_max(B, t); + } else if (irrev == 2) //scalar expounded for (ui32 i = 0; i < num_subbands; ++i) { @@ -1052,9 +1117,9 @@ namespace ojph { } int irrev = Sqcd & 0x1F; - if (irrev == 0) //reversible; this is (10.22) from the J2K book + if (irrev == 0) // reversible; this is (10.22) from the J2K book { - num_bits += u8_SPqcd[idx] >> 3; + num_bits += decode_SPqcd(u8_SPqcd[idx]); num_bits = num_bits == 0 ? 0 : num_bits - 1; } else if (irrev == 1) @@ -1214,6 +1279,239 @@ namespace ojph { OJPH_ERROR(0x000500AA, "wrong Sqcc value in QCC marker"); } + ////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + ////////////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////////////////////// + void param_nlt::check_validity(param_siz& siz) + { + if (is_any_enabled() == false) + return; + + bool all_same = true; + ui32 num_comps = siz.get_num_components(); + + // first stage; find out if all components captured by the default + // entry (ALL_COMPS) has the same bit_depth/signedness, + // while doing this, set the BDnlt for components not captured but the + // default entry (ALL_COMPS) + ui32 bit_depth = 0; // unknown yet + bool is_signed = false; // unknown yet + for (ui32 c = 0; c < num_comps; ++c) + { + param_nlt* p = get_comp_object(c); + if (p == NULL || !p->enabled) // comp is not in list or not enabled + { + if (bit_depth == 0) + { // this is the first component which has not type 3 nlt definition + bit_depth = siz.get_bit_depth(c); + is_signed = siz.is_signed(c); + } + else + { // we have seen an undefined component previously + all_same = all_same && (bit_depth == siz.get_bit_depth(c)); + all_same = all_same && (is_signed == siz.is_signed(c)); + } + } + else + { + p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1); + p->BDnlt = (ui8)(p->BDnlt | (siz.is_signed(c) ? 0x80 : 0)); + } + } + + // If the default entry is enabled/used, then if the components captured + // by it are not the same, we need to create entries for these + // components + if (this->enabled) + { + if (bit_depth != 0) // default captures some components + { + // captures at least one of the componets in the default entry + this->BDnlt = (ui8)((bit_depth - 1) | (is_signed ? 0x80 : (ui8)0)); + + if (!all_same) + { + // We cannot use the default for all components in it, so we + // will keep the first one, and we will also define other + // components on their own. 
+
+            for (ui32 c = 0; c < num_comps; ++c)
+            {
+              ui32 bd = siz.get_bit_depth(c);
+              bool is = siz.is_signed(c);
+              if (bd != bit_depth || is != is_signed)
+              {
+                // this component has different bit_depth/signedness than the
+                // default (ALL_COMPS) entry
+                param_nlt* p = get_comp_object(c);
+                if (p == NULL || !p->enabled)
+                {
+                  // this component is captured by the default (ALL_COMPS)
+                  // entry (because it is either not in the list, or
+                  // not enabled)
+                  if (p == NULL)
+                    p = add_object(c);
+                  p->enabled = true;
+                  p->BDnlt = (ui8)((bd - 1) | (is ? 0x80 : 0));
+                }
+              }
+            }
+          }
+        }
+        else
+          this->enabled = false;
+      }
+
+      trim_non_existing_components(num_comps);
+
+      if (is_any_enabled() == false)
+        return;
+      siz.set_Rsiz_flag(param_siz::RSIZ_EXT_FLAG | param_siz::RSIZ_NLT_FLAG);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_nlt::set_type3_transformation(ui32 comp_num, bool enable)
+    {
+      param_nlt* p = get_comp_object(comp_num);
+      if (p == NULL)
+        p = add_object(comp_num);
+      p->enabled = enable;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_nlt::get_type3_transformation(ui32 comp_num, ui8& bit_depth,
+                                             bool& is_signed) const
+    {
+      const param_nlt* p = get_comp_object(comp_num);
+      p = p ? p : this;
+      if (p->enabled)
+      {
+        bit_depth = (ui8)((p->BDnlt & 0x7F) + 1);
+        bit_depth = bit_depth <= 38 ? bit_depth : 38;
+        is_signed = (p->BDnlt & 0x80) == 0x80;
+      }
+      return p->enabled;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_nlt::write(outfile_base* file) const
+    {
+      if (is_any_enabled() == false)
+        return true;
+
+      char buf[2];
+      bool result = true;
+      const param_nlt* p = this;
+      while (p)
+      {
+        if (p->enabled)
+        {
+          *(ui16*)buf = JP2K_MARKER::NLT;
+          *(ui16*)buf = swap_byte(*(ui16*)buf);
+          result &= file->write(&buf, 2) == 2;
+          *(ui16*)buf = swap_byte(p->Lnlt);
+          result &= file->write(&buf, 2) == 2;
+          *(ui16*)buf = swap_byte(p->Cnlt);
+          result &= file->write(&buf, 2) == 2;
+          result &= file->write(&p->BDnlt, 1) == 1;
+          result &= file->write(&p->Tnlt, 1) == 1;
+        }
+        p = p->next;
+      }
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_nlt::read(infile_base* file)
+    {
+      ui8 buf[6];
+
+      if (file->read(buf, 6) != 6)
+        OJPH_ERROR(0x00050141, "error reading NLT marker segment");
+
+      ui16 length = swap_byte(*(ui16*)buf);
+      if (length != 6 || buf[5] != 3) // wrong length or type
+        OJPH_ERROR(0x00050142, "Unsupported NLT type %d\n", buf[5]);
+
+      ui16 comp = swap_byte(*(ui16*)(buf + 2));
+      param_nlt* p = this;
+      if (comp != special_comp_num::ALL_COMPS)
+      {
+        p = get_comp_object(comp);
+        if (p == NULL)
+          p = add_object(comp);
+      }
+      p->enabled = true;
+      p->Cnlt = comp;
+      p->BDnlt = buf[4];
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    param_nlt* param_nlt::get_comp_object(ui32 comp_num)
+    {
+      // cast object to constant
+      const param_nlt* const_p = const_cast<const param_nlt*>(this);
+      // call using the constant object, then cast the result to non-const
+      return const_cast<param_nlt*>(const_p->get_comp_object(comp_num));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    const param_nlt* param_nlt::get_comp_object(ui32 comp_num) const
+    {
+      if (Cnlt == comp_num)
+        return this;
+      else {
+        param_nlt* p = next;
+        while (p && p->Cnlt != comp_num)
+          p = p->next;
+        return p;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    param_nlt*
param_nlt::add_object(ui32 comp_num) + { + assert(Cnlt != comp_num); + param_nlt* p = this; + while (p->next != NULL) { + assert(p->Cnlt != comp_num); + p = p->next; + } + p->next = new param_nlt; + p->alloced_next = true; + p = p->next; + p->Cnlt = (ui16)comp_num; + return p; + } + + ////////////////////////////////////////////////////////////////////////// + bool param_nlt::is_any_enabled() const + { + // check if any field is enabled + const param_nlt* p = this; + while (p && p->enabled == false) + p = p->next; + return (p != NULL); + } + + ////////////////////////////////////////////////////////////////////////// + void param_nlt::trim_non_existing_components(ui32 num_comps) + { + param_nlt* p = this->next; + while (p) { + if (p->enabled == true && p->Cnlt >= num_comps) + p->enabled = false; + p = p->next; + } + } + + ////////////////////////////////////////////////////////////////////////// // // @@ -1239,10 +1537,8 @@ namespace ojph { result &= file->write(&buf, 2) == 2; *(ui32*)buf = swap_byte(Psot); result &= file->write(&buf, 4) == 4; - *(ui8*)buf = TPsot; - result &= file->write(&buf, 1) == 1; - *(ui8*)buf = TNsot; - result &= file->write(&buf, 1) == 1; + result &= file->write(&TPsot, 1) == 1; + result &= file->write(&TNsot, 1) == 1; return result; } @@ -1263,10 +1559,8 @@ namespace ojph { result &= file->write(&buf, 2) == 2; *(ui32*)buf = swap_byte(payload_len + 14); result &= file->write(&buf, 4) == 4; - *(ui8*)buf = TPsot; - result &= file->write(&buf, 1) == 1; - *(ui8*)buf = TNsot; - result &= file->write(&buf, 1) == 1; + result &= file->write(&TPsot, 1) == 1; + result &= file->write(&TNsot, 1) == 1; return result; } @@ -1363,7 +1657,7 @@ namespace ojph { "In any case, this limit means that we have 10922 " "tileparts or more, which is a huge number."); this->num_pairs = num_pairs; - pairs = (Ttlm_Ptlm_pair*)store; + pairs = store; Ltlm = (ui16)(4 + 6 * num_pairs); Ztlm = 0; Stlm = 0x60; diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 1958b8e..cce5cd8 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -138,6 +138,7 @@ namespace ojph { COM = 0xFF64, //comment DFS = 0xFF72, //downsampling factor styles ADS = 0xFF73, //arbitrary decomposition styles + NLT = 0xFF76, //non-linearity point transformation ATK = 0xFF79, //arbitrary transformation kernels SOT = 0xFF90, //start of tile-part SOP = 0xFF91, //start of packet @@ -165,13 +166,26 @@ namespace ojph { { friend ::ojph::param_siz; + public: + enum : ui16 { + RSIZ_NLT_FLAG = 0x200, + RSIZ_HT_FLAG = 0x4000, + RSIZ_EXT_FLAG = 0x8000, + }; + public: param_siz() { - memset(this, 0, sizeof(param_siz)); + Lsiz = Csiz = 0; + Xsiz = Ysiz = XOsiz = YOsiz = XTsiz = YTsiz = XTOsiz = YTOsiz = 0; + skipped_resolutions = 0; + memset(store, 0, sizeof(store)); + ws_kern_support_needed = dfs_support_needed = false; + cod = NULL; + dfs = NULL; + Rsiz = RSIZ_HT_FLAG; cptr = store; old_Csiz = 4; - Rsiz = 0x4000; //for jph, bit 14 of Rsiz is 1 } ~param_siz() @@ -255,6 +269,7 @@ namespace ojph { ui32 t = ojph_div_ceil(Xsiz, ds) - ojph_div_ceil(XOsiz, ds); return t; } + ui32 get_height(ui32 comp_num) const { assert(comp_num < get_num_components()); @@ -273,6 +288,11 @@ namespace ojph { bool is_ws_kern_support_needed() { return ws_kern_support_needed; } bool is_dfs_support_needed() { return dfs_support_needed; } + void set_Rsiz_flag(ui16 flag) + { Rsiz |= flag; } + void reset_Rsiz_flag(ui16 flag) + { Rsiz = (ui16)(Rsiz & ~flag); } + private: ui16 Lsiz; 
ui16 Rsiz; @@ -503,6 +523,9 @@ namespace ojph { return (Scod & 4) == 4; } + //////////////////////////////////////// + ui32 propose_implementation_precision(const param_siz* siz) const; + //////////////////////////////////////// bool write(outfile_base *file); @@ -626,7 +649,11 @@ namespace ojph { bool is_employing_color_transform); void set_irrev_quant(ui32 num_decomps); - protected: + ui8 decode_SPqcd(ui8 v) const + { return (ui8)(v >> 3); } + ui8 encode_SPqcd(ui8 v) const + { return (ui8)(v << 3); } + protected: ui16 Lqcd; ui8 Sqcd; union @@ -659,6 +686,64 @@ namespace ojph { ui16 comp_idx; }; + /////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + /////////////////////////////////////////////////////////////////////////// + // data structures used by param_nlt + struct param_nlt + { + using special_comp_num = ojph::param_nlt::special_comp_num; + public: + param_nlt() { + Lnlt = 6; + Cnlt = special_comp_num::ALL_COMPS; // default + BDnlt = 0; + Tnlt = 3; + enabled = false; next = NULL; alloced_next = false; + } + + ~param_nlt() { + if (next && alloced_next) { + delete next; + alloced_next = false; + next = NULL; + } + } + + void check_validity(param_siz& siz); + void set_type3_transformation(ui32 comp_num, bool enable); + bool get_type3_transformation(ui32 comp_num, ui8& bit_depth, + bool& is_signed) const; + bool write(outfile_base* file) const; + void read(infile_base* file); + + private: + const param_nlt* get_comp_object(ui32 comp_num) const; + param_nlt* get_comp_object(ui32 comp_num); + param_nlt* add_object(ui32 comp_num); + bool is_any_enabled() const; + void trim_non_existing_components(ui32 num_comps); + + private: + ui16 Lnlt; // length of the marker segment excluding marker + ui16 Cnlt; // Component involved in the transformation + ui8 BDnlt; // Decoded image component bit depth parameter + ui8 Tnlt; // Type of non-linearity + bool enabled; // true if this object is used + param_nlt* next; // for chaining NLT markers + bool alloced_next; // true if next was allocated, not just set to an + // existing object + + // The top level param_nlt object is not allocated, but as part of + // codestream, and is used to manage allocated next objects. + // next holds a list of param_nlt objects, which are managed by the top + // param_nlt object. 
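+    // In other words, the object embedded in the codestream always
+    // describes Cnlt == ALL_COMPS; per-component objects are appended to
+    // the chain by add_object() and released by the chained destructor
+    // when alloced_next is true.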
+  };
+
  ///////////////////////////////////////////////////////////////////////////
  //
  //
@@ -792,9 +877,10 @@ namespace ojph {
    };
  public: // member functions
-    param_dfs() { memset(this, 0, sizeof(param_dfs)); }
+    param_dfs() { init(); }
    ~param_dfs() { if (next) delete next; }
-    void init() { memset(this, 0, sizeof(param_dfs)); }
+    void init()
+    { Ldfs = Sdfs = Ids = 0; memset(Ddfs, 0, sizeof(Ddfs)); next = NULL; }
    bool read(infile_base *file);
    bool exists() const { return Ldfs != 0; }
@@ -869,8 +955,17 @@
    bool read_coefficient(infile_base *file, float &K);
    bool read_coefficient(infile_base *file, si16 &K);
    void init(bool clear_all = true) {
-      if (clear_all)
-        memset(this, 0, sizeof(param_atk));
+      if (clear_all)
+      {
+        Latk = Satk = 0;
+        Katk = 0.0f;
+        Natk = 0;
+        d = NULL;
+        max_steps = 0;
+        memset(d_store, 0, sizeof(d_store));
+        next = NULL;
+        alloced_next = false;
+      }
      d = d_store;
      max_steps = sizeof(d_store) / sizeof(lifting_step); }
    void init_irv97();
diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp
index 813e33b..803790d 100644
--- a/src/core/codestream/ojph_precinct.cpp
+++ b/src/core/codestream/ojph_precinct.cpp
@@ -221,7 +221,9 @@ namespace ojph {
          {
            int num_zeros = *mmsb_tag.get(x>>levm1, y>>levm1, levm1);
            num_zeros -= *mmsb_tag.get(x>>cur_lev, y>>cur_lev, cur_lev);
-            bb_put_bits(&bb, 1, num_zeros + 1,
+            bb_put_zeros(&bb, num_zeros,
+                         elastic, cur_coded_list, ph_bytes);
+            bb_put_bits(&bb, 1, 1,
                         elastic, cur_coded_list, ph_bytes);
            *mmsb_tag_flags.get(x>>levm1, y>>levm1, levm1) = 1;
          }
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index b82a810..0246400 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -199,6 +199,9 @@ namespace ojph {
        allocator->pre_alloc_obj((size_t)num_precincts.area());
      }

+      const param_siz* szp = codestream->get_siz();
+      ui32 precision = cdp->propose_implementation_precision(szp);
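+      // when propose_implementation_precision() reports more than 32 bits,
+      // the si64 (64-bit sample) allocation path below is taken instead of
+      // the si32 path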
+
      //allocate lines
      if (skipped_res_for_recon == false)
      {
@@ -207,10 +210,19 @@
        allocator->pre_alloc_obj(num_steps + 2);

        ui32 width = res_rect.siz.w + 1;
-        for (ui32 i = 0; i < num_steps; ++i)
+        if (precision <= 32) {
+          for (ui32 i = 0; i < num_steps; ++i)
+            allocator->pre_alloc_data<si32>(width, 1);
+          allocator->pre_alloc_data<si32>(width, 1);
          allocator->pre_alloc_data<si32>(width, 1);
-        allocator->pre_alloc_data<si32>(width, 1);
-        allocator->pre_alloc_data<si32>(width, 1);
+        }
+        else
+        {
+          for (ui32 i = 0; i < num_steps; ++i)
+            allocator->pre_alloc_data<si64>(width, 1);
+          allocator->pre_alloc_data<si64>(width, 1);
+          allocator->pre_alloc_data<si64>(width, 1);
+        }
      }
    }

@@ -245,8 +257,8 @@
      const param_dfs* dfs = codestream->access_dfs();
      if (dfs == NULL) {
        OJPH_ERROR(0x00070011, "There is a problem with codestream "
-          "marker segments. COD/COC specifies the use of a DFS marker "
-          "but there are no DFS markers within the main codestream "
+          "marker segments. COD/COC specifies the use of a DFS marker "
+          "but there are no DFS markers within the main codestream "
          "headers");
      }
      else {
@@ -436,6 +448,9 @@ namespace ojph {
        level_index[i] = level_index[i - 1] + val;
      cur_precinct_loc = point(0, 0);

+      const param_siz* szp = codestream->get_siz();
+      ui32 precision = cdp->propose_implementation_precision(szp);
+
      //allocate lines
      if (skipped_res_for_recon == false)
      {
@@ -460,11 +475,22 @@ namespace ojph {
        // initiate storage of line_buf
        ui32 width = res_rect.siz.w + 1;
-        for (ui32 i = 0; i < num_steps; ++i)
-          ssp[i].line->wrap(
-            allocator->post_alloc_data<si32>(width, 1), width, 1);
-        sig->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
-        aug->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+        if (precision <= 32)
+        {
+          for (ui32 i = 0; i < num_steps; ++i)
+            ssp[i].line->wrap(
+              allocator->post_alloc_data<si32>(width, 1), width, 1);
+          sig->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+          aug->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+        }
+        else
+        {
+          for (ui32 i = 0; i < num_steps; ++i)
+            ssp[i].line->wrap(
+              allocator->post_alloc_data<si64>(width, 1), width, 1);
+          sig->line->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
+          aug->line->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
+        }

      cur_line = 0;
      rows_to_produce = res_rect.siz.h;
@@ -682,8 +708,9 @@ namespace ojph {
            rev_horz_syn(atk, aug->line, child_res->pull_line(),
                         bands[1].pull_line(), width, horz_even);
          else
-            memcpy(aug->line->i32, child_res->pull_line()->i32,
-                   width * sizeof(si32));
+            memcpy(aug->line->p, child_res->pull_line()->p,
+                   (size_t)width
+                   * (aug->line->flags & line_buf::LFT_SIZE_MASK));
          aug->active = true;
          vert_even = !vert_even;
          ++cur_line;
@@ -694,8 +721,9 @@
            rev_horz_syn(atk, sig->line, bands[2].pull_line(),
                         bands[3].pull_line(), width, horz_even);
          else
-            memcpy(sig->line->i32, bands[2].pull_line()->i32,
-                   width * sizeof(si32));
+            memcpy(sig->line->p, bands[2].pull_line()->p,
+                   (size_t)width
+                   * (sig->line->flags & line_buf::LFT_SIZE_MASK));
          sig->active = true;
          vert_even = !vert_even;
          ++cur_line;
@@ -733,8 +761,9 @@
            rev_horz_syn(atk, aug->line, child_res->pull_line(),
                         bands[1].pull_line(), width, horz_even);
          else
-            memcpy(aug->line->i32, child_res->pull_line()->i32,
-                   width * sizeof(si32));
+            memcpy(aug->line->p, child_res->pull_line()->p,
+                   (size_t)width
+                   * (aug->line->flags & line_buf::LFT_SIZE_MASK));
        }
        else
        {
@@ -742,11 +771,22 @@
            rev_horz_syn(atk, aug->line, bands[2].pull_line(),
                         bands[3].pull_line(), width, horz_even);
          else
-            memcpy(aug->line->i32, bands[2].pull_line()->i32,
-                   width * sizeof(si32));
-          si32* sp = aug->line->i32;
-          for (ui32 i = width; i > 0; --i)
-            *sp++ >>= 1;
+            memcpy(aug->line->p, bands[2].pull_line()->p,
+                   (size_t)width
+                   * (aug->line->flags & line_buf::LFT_SIZE_MASK));
+          if (aug->line->flags & line_buf::LFT_32BIT)
+          {
+            si32* sp = aug->line->i32;
+            for (ui32 i = width; i > 0; --i)
+              *sp++ >>= 1;
+          }
+          else
+          {
+            assert(aug->line->flags & line_buf::LFT_64BIT);
+            si64* sp = aug->line->i64;
+            for (ui32 i = width; i > 0; --i)
+              *sp++ >>= 1;
+          }
        }
        return aug->line;
      }
@@ -854,8 +894,8 @@
          rev_horz_syn(atk, aug->line, child_res->pull_line(),
                       bands[1].pull_line(), width, horz_even);
        else
-          memcpy(aug->line->i32, child_res->pull_line()->i32,
-                 width * sizeof(si32));
+          memcpy(aug->line->p, child_res->pull_line()->p,
+                 (size_t)width * (aug->line->flags & line_buf::LFT_SIZE_MASK));
        return aug->line;
      }
      else
diff --git
a/src/core/codestream/ojph_resolution.h b/src/core/codestream/ojph_resolution.h
index 635a4ce..6156455 100644
--- a/src/core/codestream/ojph_resolution.h
+++ b/src/core/codestream/ojph_resolution.h
@@ -45,7 +45,7 @@ namespace ojph {
  ////////////////////////////////////////////////////////////////////////////
  //defined elsewhere
-  struct line_buf;
+  class line_buf;
  class mem_elastic_allocator;
  class codestream;
diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp
index cf007fc..8efc8de 100644
--- a/src/core/codestream/ojph_subband.cpp
+++ b/src/core/codestream/ojph_subband.cpp
@@ -91,13 +91,18 @@ namespace ojph {
      allocator->pre_alloc_obj<codeblock>((size_t)num_blocks.area());

      for (ui32 i = 0; i < num_blocks.w; ++i)
-        codeblock::pre_alloc(codestream, nominal);
+        codeblock::pre_alloc(codestream, comp_num, nominal);

      //allocate lines
      allocator->pre_alloc_obj<line_buf>(1);
      //allocate line_buf
      ui32 width = band_rect.siz.w + 1;
-      allocator->pre_alloc_data<si32>(width, 1);
+      const param_siz* szp = codestream->get_siz();
+      ui32 precision = cdp->propose_implementation_precision(szp);
+      if (precision <= 32)
+        allocator->pre_alloc_data<si32>(width, 1);
+      else
+        allocator->pre_alloc_data<si64>(width, 1);
    }

    //////////////////////////////////////////////////////////////////////////
@@ -192,7 +197,12 @@ namespace ojph {
      lines = allocator->post_alloc_obj<line_buf>(1);
      //allocate line_buf
      ui32 width = band_rect.siz.w + 1;
-      lines->wrap(allocator->post_alloc_data<si32>(width,1),width,1);
+      const param_siz* szp = codestream->get_siz();
+      ui32 precision = cdp->propose_implementation_precision(szp);
+      if (precision <= 32)
+        lines->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+      else
+        lines->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
    }

    //////////////////////////////////////////////////////////////////////////
@@ -256,10 +266,11 @@
      if (empty)
        return;

-      assert(l->pre_size == lines[0].pre_size && l->size == lines[0].size);
-      si32* t = lines[0].i32;
-      lines[0].i32 = l->i32;
-      l->i32 = t;
+      assert(l->pre_size == lines[0].pre_size && l->size == lines[0].size &&
+             l->flags == lines[0].flags);
+      void* p = lines[0].p;
+      lines[0].p = l->p;
+      l->p = p;
    }

    //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h
index 8cadae0..e1c291a 100644
--- a/src/core/codestream/ojph_subband.h
+++ b/src/core/codestream/ojph_subband.h
@@ -45,7 +45,7 @@ namespace ojph {
  ////////////////////////////////////////////////////////////////////////////
  //defined elsewhere
-  struct line_buf;
+  class line_buf;
  class mem_elastic_allocator;
  class codestream;
@@ -94,6 +94,8 @@
      bool exists() { return !empty; }

      line_buf* pull_line();
+      resolution* get_parent() { return parent; }
+      const resolution* get_parent() const { return parent; }

    private:
      bool empty; // true if the subband has no pixels or
diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index 3be907d..4755bb4 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -67,6 +67,7 @@ namespace ojph {
      allocator->pre_alloc_obj<ui32>(num_comps); //for line_offsets
      allocator->pre_alloc_obj<ui32>(num_comps); //for num_bits
      allocator->pre_alloc_obj<bool>(num_comps); //for is_signed
+      allocator->pre_alloc_obj<bool>(num_comps); //for nlt_type3
      allocator->pre_alloc_obj<ui32>(num_comps); //for cur_line

      ui32 tilepart_div = codestream->get_tilepart_div();
@@ -142,6 +143,7 @@

      //allocate tiles_comp
      const param_siz *szp =
codestream->get_siz();
+      const param_nlt *nlp = codestream->get_nlt();

      this->num_bytes = 0;
      num_comps = szp->get_num_components();
@@ -152,6 +154,7 @@
      line_offsets = allocator->post_alloc_obj<ui32>(num_comps);
      num_bits = allocator->post_alloc_obj<ui32>(num_comps);
      is_signed = allocator->post_alloc_obj<bool>(num_comps);
+      nlt_type3 = allocator->post_alloc_obj<bool>(num_comps);
      cur_line = allocator->post_alloc_obj<ui32>(num_comps);

      profile = codestream->get_profile();
@@ -176,6 +179,8 @@
      ui32 width = 0;
      for (ui32 i = 0; i < num_comps; ++i)
      {
+        ui8 bd; bool is; // used for nlt_type3
+
        point downsamp = szp->get_downsampling(i);
        point recon_downsamp = szp->get_recon_downsampling(i);
@@ -205,6 +210,13 @@
        num_bits[i] = szp->get_bit_depth(i);
        is_signed[i] = szp->is_signed(i);
+        nlt_type3[i] = nlp->get_type3_transformation(i, bd, is);
+        if (nlt_type3[i] == true && (bd != num_bits[i] || is != is_signed[i]))
+          OJPH_ERROR(0x000300A1, "Mismatch between Ssiz (bit_depth = %d, "
+            "is_signed = %s) from SIZ marker segment, and BDnlt "
+            "(bit_depth = %d, is_signed = %s) from NLT marker segment, "
+            "for component %d", num_bits[i],
+            is_signed[i] ? "True" : "False", bd, is ? "True" : "False", i);
        cur_line[i] = 0;
      }
@@ -219,8 +231,7 @@
        num_lines = 3;
        lines = allocator->post_alloc_obj<line_buf>(num_lines);
        for (int i = 0; i < 3; ++i)
-          lines[i].wrap(
-            allocator->post_alloc_data<si32>(width,0),width,0);
+          lines[i].wrap(allocator->post_alloc_data<si32>(width, 0), width, 0);
      }
      else
      {
@@ -247,13 +258,15 @@
        line_buf *tc = comps[comp_num].get_line();
        if (reversible)
        {
-          int shift = 1 << (num_bits[comp_num] - 1);
-          const si32 *sp = line->i32 + line_offsets[comp_num];
-          si32* dp = tc->i32;
-          if (is_signed[comp_num])
-            memcpy(dp, sp, comp_width * sizeof(si32));
-          else
-            cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width);
+          si64 shift = (si64)1 << (num_bits[comp_num] - 1);
+          if (is_signed[comp_num] && nlt_type3[comp_num])
+            rev_convert_nlt_type3(line, line_offsets[comp_num],
+              tc, 0, shift + 1, comp_width);
+          else {
+            shift = is_signed[comp_num] ? 0 : -shift;
+            rev_convert(line, line_offsets[comp_num], tc, 0,
+              shift, comp_width);
+          }
        }
        else
        {
@@ -269,22 +282,25 @@
      }
      else
      {
+        si64 shift = (si64)1 << (num_bits[comp_num] - 1);
        ui32 comp_width = comp_rects[comp_num].siz.w;
        if (reversible)
        {
-          int shift = 1 << (num_bits[comp_num] - 1);
-          const si32 *sp = line->i32 + line_offsets[comp_num];
-          si32 *dp = lines[comp_num].i32;
-          if (is_signed[comp_num])
-            memcpy(dp, sp, comp_width * sizeof(si32));
-          else
-            cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width);
+          if (is_signed[comp_num] && nlt_type3[comp_num])
+            rev_convert_nlt_type3(line, line_offsets[comp_num],
+              lines + comp_num, 0, shift + 1, comp_width);
+          else {
+            shift = is_signed[comp_num] ?
0 : -shift; + rev_convert(line, line_offsets[comp_num], lines + comp_num, 0, + shift, comp_width); + } + if (comp_num == 2) { // reversible color transform - rct_forward(lines[0].i32, lines[1].i32, lines[2].i32, - comps[0].get_line()->i32, - comps[1].get_line()->i32, - comps[2].get_line()->i32, comp_width); + rct_forward(lines + 0, lines + 1, lines + 2, + comps[0].get_line(), + comps[1].get_line(), + comps[2].get_line(), comp_width); comps[0].push_line(); comps[1].push_line(); comps[2].push_line(); @@ -330,13 +346,15 @@ namespace ojph { ui32 comp_width = recon_comp_rects[comp_num].siz.w; if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp = src_line->i32; - si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) - memcpy(dp, sp, comp_width * sizeof(si32)); - else - cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(src_line, 0, tgt_line, + line_offsets[comp_num], shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 0 : shift; + rev_convert(src_line, 0, tgt_line, + line_offsets[comp_num], shift, comp_width); + } } else { @@ -356,9 +374,9 @@ namespace ojph { if (comp_num == 0) { if (reversible) - rct_backward(comps[0].pull_line()->i32, comps[1].pull_line()->i32, - comps[2].pull_line()->i32, lines[0].i32, lines[1].i32, - lines[2].i32, comp_width); + rct_backward(comps[0].pull_line(), comps[1].pull_line(), + comps[2].pull_line(), lines + 0, lines + 1, + lines + 2, comp_width); else ict_backward(comps[0].pull_line()->f32, comps[1].pull_line()->f32, comps[2].pull_line()->f32, lines[0].f32, lines[1].f32, @@ -366,17 +384,20 @@ namespace ojph { } if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp; + si64 shift = (si64)1 << (num_bits[comp_num] - 1); + line_buf* src_line; if (comp_num < 3) - sp = lines[comp_num].i32; - else - sp = comps[comp_num].pull_line()->i32; - si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) - memcpy(dp, sp, comp_width * sizeof(si32)); + src_line = lines + comp_num; else - cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); + src_line = comps[comp_num].pull_line(); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(src_line, 0, tgt_line, + line_offsets[comp_num], shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 
0 : shift; + rev_convert(src_line, 0, tgt_line, + line_offsets[comp_num], shift, comp_width); + } } else { diff --git a/src/core/codestream/ojph_tile.h b/src/core/codestream/ojph_tile.h index 056c7c9..6b65a13 100644 --- a/src/core/codestream/ojph_tile.h +++ b/src/core/codestream/ojph_tile.h @@ -47,7 +47,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class codestream; namespace local { @@ -89,6 +89,7 @@ namespace ojph { ui32 *num_bits; bool *is_signed; ui32 *cur_line; + bool *nlt_type3; int prog_order; private: diff --git a/src/core/codestream/ojph_tile_comp.h b/src/core/codestream/ojph_tile_comp.h index def39e5..62b8fba 100644 --- a/src/core/codestream/ojph_tile_comp.h +++ b/src/core/codestream/ojph_tile_comp.h @@ -48,7 +48,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class codestream; namespace local { diff --git a/src/core/coding/ojph_block_common.cpp b/src/core/coding/ojph_block_common.cpp index e6b4de6..2ba138a 100644 --- a/src/core/coding/ojph_block_common.cpp +++ b/src/core/coding/ojph_block_common.cpp @@ -84,11 +84,20 @@ namespace ojph { * + 4 * mel event for initial row of quads when needed \n * \n * Each entry contains, starting from the LSB \n - * \li \c total prefix length for quads 0 and 1 (3 bits) \n - * \li \c total suffix length for quads 0 and 1 (4 bits) \n + * \li \c total total prefix length for quads 0 and 1 (3 bits) \n + * \li \c total total suffix length for quads 0 and 1 (4 bits) \n * \li \c suffix length for quad 0 (3 bits) \n * \li \c prefix for quad 0 (3 bits) \n * \li \c prefix for quad 1 (3 bits) \n + * \n + * Another table is uvlc_bias, which is needed to correctly decode the + * extension u_ext for initial row of quads. Under certain condition, + * we deduct 1 or 2 from u_q0 and u_q1 before encoding them; so for us + * to know that decoding u_ext is needed, we recreate the u_q0 and u_q1 + * that we actually encoded. \n + * For simplicity, we use the same index as before \n + * \li \c u_q0 bias is 2 bits \n + * \li \c u_q1 bias is 2 bits \n */ /// @brief uvlc_tbl0 contains decoding information for initial row of quads @@ -96,6 +105,8 @@ namespace ojph { /// @brief uvlc_tbl1 contains decoding information for non-initial row of /// quads ui16 uvlc_tbl1[256] = { 0 }; + /// @brief uvlc_bias contains decoding info. 
for initial row of quads + ui8 uvlc_bias[256+64] = { 0 }; /// @} //************************************************************************/ @@ -199,8 +210,10 @@ namespace ojph { ui32 mode = i >> 6; ui32 vlc = i & 0x3F; - if (mode == 0) // both u_off are 0 + if (mode == 0) { // both u_off are 0 uvlc_tbl0[i] = 0; + uvlc_bias[i] = 0; + } else if (mode <= 2) // u_off are either 01 or 10 { ui32 d = dec[vlc & 0x7]; //look at the least significant 3 bits @@ -232,6 +245,7 @@ namespace ojph { total_suffix = u0_suffix_len; u0 = d0 >> 5; u1 = (vlc & 1) + 1; + uvlc_bias[i] = 4; // 0b00 for u0 and 0b01 for u1 } else { @@ -240,6 +254,7 @@ namespace ojph { total_suffix = u0_suffix_len + ((d1 >> 2) & 0x7); u0 = d0 >> 5; u1 = d1 >> 5; + uvlc_bias[i] = 0; } uvlc_tbl0[i] = (ui16)(total_prefix | @@ -265,6 +280,7 @@ namespace ojph { (u0_suffix_len << 7) | (u0 << 10) | (u1 << 13)); + uvlc_bias[i] = 10; // 0b10 for u0 and 0b10 for u1 } } diff --git a/src/core/coding/ojph_block_common.h b/src/core/coding/ojph_block_common.h index 29a84ba..f8d6503 100644 --- a/src/core/coding/ojph_block_common.h +++ b/src/core/coding/ojph_block_common.h @@ -44,6 +44,6 @@ namespace ojph{ extern ui16 vlc_tbl1[1024]; extern ui16 uvlc_tbl0[256+64]; extern ui16 uvlc_tbl1[256]; - + extern ui8 uvlc_bias[256+64]; } // !namespace local } // !namespace ojph diff --git a/src/core/coding/ojph_block_decoder.h b/src/core/coding/ojph_block_decoder.h index dcd3220..a197017 100644 --- a/src/core/coding/ojph_block_decoder.h +++ b/src/core/coding/ojph_block_decoder.h @@ -50,7 +50,12 @@ namespace ojph { // generic decoder bool - ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data, + ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + + bool + ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal); @@ -60,6 +65,12 @@ namespace ojph { ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal); + // AVX2-accelerated decoder + bool + ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + // WASM SIMD-accelerated decoder bool ojph_decode_codeblock_wasm(ui8* coded_data, ui32* decoded_data, diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder32.cpp similarity index 98% rename from src/core/coding/ojph_block_decoder.cpp rename to src/core/coding/ojph_block_decoder32.cpp index 5be5430..f54c77e 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder32.cpp @@ -739,11 +739,11 @@ namespace ojph { * @param [in] stride is the decoded codeblock buffer stride * @param [in] stripe_causal is true for stripe causal mode */ - bool ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data, - ui32 missing_msbs, ui32 num_passes, - ui32 lengths1, ui32 lengths2, - ui32 width, ui32 height, ui32 stride, - bool stripe_causal) + bool ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, + bool stripe_causal) { static bool insufficient_precision = false; static bool modify_code = false; @@ -753,14 +753,14 @@ namespace 
ojph { { OJPH_WARN(0x00010001, "A malformed codeblock that has more than " "one coding pass, but zero length for " - "2nd and potential 3rd pass"); + "2nd and potential 3rd pass."); num_passes = 1; } if (num_passes > 3) { OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " - "This codeblocks has %d passes", + "This codeblocks has %d passes.", num_passes); return false; } @@ -772,7 +772,7 @@ namespace ojph { insufficient_precision = true; OJPH_WARN(0x00010003, "32 bits are not enough to decode this " "codeblock. This message will not be " - "displayed again"); + "displayed again."); } return false; } @@ -783,7 +783,7 @@ namespace ojph { OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " "pass. The code can be modified to support " "this case. This message will not be " - "displayed again"); + "displayed again."); } return false; // 32 bits are not enough to decode this } @@ -796,7 +796,7 @@ namespace ojph { OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " "nor MagRef passes; both will be skipped. " "This message will not be displayed " - "again"); + "again."); } } } @@ -806,7 +806,7 @@ namespace ojph { if (lengths1 < 2) { - OJPH_WARN(0x00010006, "Wrong codeblock length"); + OJPH_WARN(0x00010006, "Wrong codeblock length."); return false; } @@ -1079,7 +1079,7 @@ namespace ojph { // quad 0 length len = uvlc_entry & 0x7; // quad 0 suffix length uvlc_entry >>= 3; - ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); sp[1] = u_q; u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q sp[3] = u_q; @@ -1217,7 +1217,7 @@ namespace ojph { ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1? ui32 emax = vp[0] | vp[1]; - emax = 31 - count_leading_zeros(emax | 2); // emax - 1 + emax = 31 - count_leading_zeros(emax | 2); // emax - 1 ui32 kappa = gamma ? emax : 1; ui32 U_q = u_q + kappa; @@ -1613,4 +1613,4 @@ namespace ojph { return true; } } -} +} \ No newline at end of file diff --git a/src/core/coding/ojph_block_decoder64.cpp b/src/core/coding/ojph_block_decoder64.cpp new file mode 100644 index 0000000..8801735 --- /dev/null +++ b/src/core/coding/ojph_block_decoder64.cpp @@ -0,0 +1,1663 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2019, The University of New South Wales, Australia +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_block_decoder64.cpp
+// Author: Aous Naman
+// Date: 13 May 2022
+//***************************************************************************/
+
+//***************************************************************************/
+/** @file ojph_block_decoder64.cpp
+ *  @brief implements a HTJ2K block decoder
+ */
+
+#include <string>
+#include <iostream>
+
+#include <cassert>
+#include <cstring>
+#include "ojph_block_common.h"
+#include "ojph_block_decoder.h"
+#include "ojph_arch.h"
+#include "ojph_message.h"
+
+namespace ojph {
+  namespace local {
+
+    //************************************************************************/
+    /** @brief MEL state structure for reading and decoding the MEL bitstream
+     *
+     *  A number of events is decoded from the MEL bitstream ahead of time
+     *  and stored in run/num_runs.
+     *  Each run represents the number of zero events before a one event.
+     */
+    struct dec_mel_st {
+      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
+        k(0), num_runs(0), runs(0)
+      {}
+      // data decoding machinery
+      ui8* data;    //!< the address of data (or bitstream)
+      ui64 tmp;     //!< temporary buffer for read data
+      int bits;     //!< number of bits stored in tmp
+      int size;     //!< number of bytes in MEL code
+      bool unstuff; //!< true if the next bit needs to be unstuffed
+      int k;        //!< state of MEL decoder
+
+      // queue of decoded runs
+      int num_runs; //!< number of decoded runs left in runs (maximum 8)
+      ui64 runs;    //!< runs of decoded MEL codewords (7 bits/run)
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs the MEL bitstream
+     *
+     *  @param [in] melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_read(dec_mel_st *melp)
+    {
+      if (melp->bits > 32) //there are enough bits in the tmp variable
+        return;            // return without reading new data
+
+      ui32 val = 0xFFFFFFFF;      // feed in 0xFF if buffer is exhausted
+      if (melp->size > 4) {       // if there is data in the MEL segment
+        val = *(ui32*)melp->data; // read 32 bits from MEL data
+        melp->data += 4;          // advance pointer
+        melp->size -= 4;          // reduce counter
+      }
+      else if (melp->size > 0)
+      { // 4 or less
+        int i = 0;
+        while (melp->size > 1) {
+          ui32 v = *melp->data++; // read one byte at a time
+          ui32 m = ~(0xFFu << i); // mask of location
+          val = (val & m) | (v << i);// put one byte in its correct location
+          --melp->size;
+          i += 8;
+        }
+        // size equal to 1
+        ui32 v = *melp->data++; // the one before the last is different
+        v |= 0xF;               // MEL and VLC segments can overlap
+        ui32 m = ~(0xFFu << i);
+        val = (val & m) | (v << i);
+        --melp->size;
+      }
+
+      // next we unstuff them before adding them to the buffer
+      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
+                                     // the previously read byte requires
+                                     // unstuffing
+
+      // data is unstuffed and accumulated in t
+      // bits has the number of bits in t
+      ui32 t = val & 0xFF;
+      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
+      bits -= unstuff; // there is one less bit in t if unstuffing is needed
+      t = t << (8 - unstuff); // move up to make room for the next byte
+
+      //this is a repeat of the above
+      t |= (val>>8) & 0xFF;
+      unstuff = (((val >> 8) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val>>16) & 0xFF;
+      unstuff = (((val >> 16) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val>>24) & 0xFF;
+      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+
+      // move t to tmp, and push the result all the way up, so we read from
+      // the MSB
+      melp->tmp |= ((ui64)t) << (64 - bits -
melp->bits); + melp->bits += bits; //increment the number of bits in tmp + } + + //************************************************************************/ + /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs + * + * Runs are stored in "runs" and the number of runs in "num_runs". + * Each run represents a number of zero events that may or may not + * terminate in a 1 event. + * Each run is stored in 7 bits. The LSB is 1 if the run terminates in + * a 1 event, 0 otherwise. The next 6 bits, for the case terminating + * with 1, contain the number of consecutive 0 zero events * 2; for the + * case terminating with 0, they store (number of consecutive 0 zero + * events - 1) * 2. + * A total of 6 bits (made up of 1 + 5) should have been enough. + * + * @param [in] melp is a pointer to dec_mel_st structure + */ + static inline + void mel_decode(dec_mel_st *melp) + { + static const int mel_exp[13] = { //MEL exponents + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5 + }; + + if (melp->bits < 6) // if there are less than 6 bits in tmp + mel_read(melp); // then read from the MEL bitstream + // 6 bits is the largest decodable MEL cwd + + //repeat so long that there is enough decodable bits in tmp, + // and the runs store is not full (num_runs < 8) + while (melp->bits >= 6 && melp->num_runs < 8) + { + int eval = mel_exp[melp->k]; // number of bits associated with state + int run = 0; + if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB) + { //one is found + run = 1 << eval; + run--; // consecutive runs of 0 events - 1 + melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12 + melp->tmp <<= 1; // consume one bit from tmp + melp->bits -= 1; + run = run << 1; // a stretch of zeros not terminating in one + } + else + { //0 is found + run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1); + melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0 + melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6) + melp->bits -= eval + 1; + run = (run << 1) + 1; // a stretch of zeros terminating with one + } + eval = melp->num_runs * 7; // 7 bits per run + melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient + melp->runs |= ((ui64)run) << eval; // store the value in runs + melp->num_runs++; // increment count + } + } + + //************************************************************************/ + /** @brief Initiates a dec_mel_st structure for MEL decoding and reads + * some bytes in order to get the read address to a multiple + * of 4 + * + * @param [in] melp is a pointer to dec_mel_st structure + * @param [in] bbuf is a pointer to byte buffer + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup) + { + melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL + melp->bits = 0; // 0 bits in tmp + melp->tmp = 0; // + melp->unstuff = false; // no unstuffing + melp->size = scup - 1; // size is the length of MEL+VLC-1 + melp->k = 0; // 0 for state + melp->num_runs = 0; // num_runs is 0 + melp->runs = 0; // + + //This code is borrowed; original is for a different architecture + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. 
It reads 1,2,3 up to 4 bytes from the MEL segment
+      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
+      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
+        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
+        ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
+                                                       //set data to 0xFF
+        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
+                                       // see the standard
+        melp->data += melp->size-- > 0; //increment if the end is not reached
+        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
+        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
+        melp->bits += d_bits; //increment tmp by number of bits
+        melp->unstuff = ((d & 0xFF) == 0xFF); //true if next byte needs
+                                              //unstuffing
+      }
+      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
+                                       // is the MSB
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves one run from dec_mel_st; if there are no runs stored
+     *         MEL segment is decoded
+     *
+     * @param [in] melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    int mel_get_run(dec_mel_st *melp)
+    {
+      if (melp->num_runs == 0) //if no runs, decode more bits from MEL segment
+        mel_decode(melp);
+
+      int t = melp->runs & 0x7F; //retrieve one run
+      melp->runs >>= 7; // remove the retrieved run
+      melp->num_runs--;
+      return t; // return run
+    }
+
+    //************************************************************************/
+    /** @brief A structure for reading and unstuffing a segment that grows
+     *         backward, such as VLC and MRP
+     */
+    struct rev_struct {
+      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
+      {}
+      //storage
+      ui8* data;    //!< pointer to where to read data
+      ui64 tmp;     //!< temporary buffer of read data
+      ui32 bits;    //!< number of bits stored in tmp
+      int size;     //!< number of bytes left
+      bool unstuff; //!< true if the last byte is more than 0x8F
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs one byte from a backward-growing segment
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read8(rev_struct *vlcp)
+    {
+      ui8 val = 0;
+      if (vlcp->size > 0) // if there are bytes left in the VLC segment
+      {
+        val = *vlcp->data; // then read 8 bits
+        --vlcp->data;      // decrement data pointer
+        --vlcp->size;      // decrement number of bytes in the buffer
+      }
+
+      // accumulate in tmp, and increment bits, check if unstuffing is needed
+      ui8 t = (vlcp->unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0;
+      val = (ui8)(val & (0xFFU >> t)); // protect against erroneous 1 in MSB
+      vlcp->tmp |= (ui64)val << vlcp->bits;
+      vlcp->bits += 8 - t;
+      vlcp->unstuff = val > 0x8F;
+    }
+
+    //************************************************************************/
+    /** @brief Initiates the rev_struct structure and reads the first byte
+     *
+     *  This subroutine initializes the VLC decoder. It discards the first
+     *  12 bits (they have the sum of the lengths of VLC and MEL segments),
+     *  and depending on unstuffing, stores 3 or 4 bits in the unstuffed
+     *  decoded buffer.
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     *  @param [in] data is a pointer to byte at the start of the cleanup pass
+     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in] scup is the length of MEL+VLC segments
+     */
+    static inline
+    void rev_init8(rev_struct *vlcp, ui8* data, int lcup, int scup)
+    {
+      //first byte has only the upper 4 bits
+      vlcp->data = data + lcup - 2;
+
+      //size cannot be larger than this, in fact it should be smaller
+      vlcp->size = scup - 2;
+
+      ui8 val = *vlcp->data--; // read one byte (this is a half byte)
+
+      // the first byte is treated differently from other bytes, because
+      // only the MSB nibble is part of the VLC code.
+      val = (ui8)(val >> 4);
+      ui8 t = ((val & 0x7) == 0x7) ?
1 : 0; // unstuffing is needed
+      val = (ui8)(val & (0xFU >> t)); // protect against erroneous 1 in MSB
+      vlcp->tmp = val;
+      vlcp->bits = 4 - t;
+      vlcp->unstuff = val > 0x8; //this is useful for the next byte
+    }
+
+    //************************************************************************/
+    /** @brief Fills the temporary variable (vlcp->tmp) by up to 64 bits
+     *
+     *  By the end of this call, vlcp->tmp must have no less than 56 bits
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    ui64 rev_fetch64(rev_struct *vlcp)
+    {
+      while (vlcp->bits <= 56)
+        rev_read8(vlcp); // read 8 bits, but unstuffing might reduce this
+      return vlcp->tmp;  // return unstuff decoded bits
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     *  @param [in] num_bits is the number of bits to be removed
+     */
+    static inline
+    ui64 rev_advance64(rev_struct *vlcp, ui32 num_bits)
+    {
+      assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
+      vlcp->tmp >>= num_bits; // remove bits
+      vlcp->bits -= num_bits; // decrement the number of bits
+      return vlcp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs from rev_struct
+     *
+     *  This differs from rev_read in that it fills in zeros when the
+     *  available data is consumed; the other does not care about the
+     *  values when all data is consumed.
+     *
+     *  See rev_read for more information about unstuffing
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read_mrp(rev_struct *mrp)
+    {
+      //process 4 bytes at a time
+      if (mrp->bits > 32)
+        return;
+      ui32 val = 0;
+      if (mrp->size > 3) // if there are 4 bytes or more
+      { // (mrp->data - 3) move pointer back to read 32 bits at once
+        val = *(ui32*)(mrp->data - 3); // read 32 bits
+        mrp->data -= 4;                // move back pointer
+        mrp->size -= 4;                // reduce count
+      }
+      else if (mrp->size > 0)
+      {
+        int i = 24;
+        while (mrp->size > 0) {
+          ui32 v = *mrp->data--; // read one byte at a time
+          val |= (v << i);       // put byte in its correct location
+          --mrp->size;
+          i -= 8;
+        }
+      }
+
+      //accumulate in tmp, and keep count in bits
+      ui32 bits, tmp = val >> 24;
+
+      //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
+      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
+      bool unstuff = (val >> 24) > 0x8F;
+
+      //process the next byte
+      tmp |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 16) & 0xFF) > 0x8F;
+
+      tmp |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 8) & 0xFF) > 0x8F;
+
+      tmp |= (val & 0xFF) << bits;
+      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = (val & 0xFF) > 0x8F;
+
+      mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
+      mrp->bits += bits;
+      mrp->unstuff = unstuff;             // next byte
+    }
+
+    //************************************************************************/
+    /** @brief Initializes the rev_struct structure for the MRP segment, and
+     *         reads a number of bytes such that the next 32 bits read are
+     *         from an address that is a multiple of 4.
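+     *         For example, if mrp->data initially points to an address
+     *         ending in 0x06, then num = 1 + (0x06 & 3) = 3 bytes are read
+     *         one at a time, leaving the pointer so that the following
+     *         32-bit reads are 4-byte aligned.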
Note this is designed for
+     *         an architecture where the read size must be compatible with
+     *         the alignment of the read address
+     *
+     *  There is another similar subroutine rev_init. This subroutine does
+     *  NOT skip the first 12 bits, and starts with unstuff set to true.
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     *  @param [in] data is a pointer to byte at the start of the cleanup pass
+     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in] len2 is the length of SPP+MRP segments
+     */
+    static inline
+    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
+    {
+      mrp->data = data + lcup + len2 - 1;
+      mrp->size = len2;
+      mrp->unstuff = true;
+      mrp->bits = 0;
+      mrp->tmp = 0;
+
+      //This code is designed for an architecture where the read address
+      // should align to the read size (address multiple of 4 if read size
+      // is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream
+      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
+      for (int i = 0; i < num; ++i) {
+        ui64 d;
+        //read a byte, 0 if no more data
+        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
+        //check if unstuffing is needed
+        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
+        mrp->tmp |= d << mrp->bits; // move data to mrp->tmp
+        mrp->bits += d_bits;
+        mrp->unstuff = d > 0x8F; // for next byte
+      }
+      rev_read_mrp(mrp);
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, mrp->tmp must have no less than 33 bits
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch_mrp(rev_struct *mrp)
+    {
+      if (mrp->bits < 32) // if there are fewer than 32 bits in mrp->tmp
+      {
+        rev_read_mrp(mrp);   // read 30-32 bits from mrp
+        if (mrp->bits < 32)  // if there is still space for 32 bits
+          rev_read_mrp(mrp); // read more
+      }
+      return (ui32)mrp->tmp; // return the head of mrp->tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     *  @param [in] num_bits is the number of bits to be removed
+     */
+    static inline
+    ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
+    {
+      assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
+      mrp->tmp >>= num_bits; // discard the lowest num_bits bits
+      mrp->bits -= num_bits;
+      return (ui32)mrp->tmp; // return data after consumption
+    }
+
+    //************************************************************************/
+    /** @brief State structure for reading and unstuffing of forward-growing
+     *         bitstreams; these are: MagSgn and SPP bitstreams
+     */
+    struct frwd_struct {
+      const ui8* data; //!< pointer to bitstream
+      ui64 tmp;        //!< temporary buffer of read data
+      ui32 bits;       //!< number of bits stored in tmp
+      bool unstuff;    //!< true if a bit needs to be unstuffed from next byte
+      int size;        //!< size of data
+    };
+
+    //************************************************************************/
+    /** @brief Read and unstuffs 32 bits from forward-growing bitstream
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted
+     *  @param [in] msp is a pointer to frwd_struct structure
+     */
+    template<int X>
+    static inline
+    void frwd_read(frwd_struct *msp)
+    {
+      assert(msp->bits <= 32); // assert that there is a space for 32 bits
+
+      ui32 val = 0;
+      if (msp->size > 3) {
+        val = *(ui32*)msp->data; // read 32 bits
+        msp->data += 4;          // increment pointer
+        msp->size -= 4;          // reduce size
+      }
+      else if (msp->size > 0)
+      {
+        int i = 0;
+        val = X != 0 ? 0xFFFFFFFFu : 0;
+        while (msp->size > 0) {
+          ui32 v = *msp->data++;  // read one byte at a time
+          ui32 m = ~(0xFFu << i); // mask of location
+          val = (val & m) | (v << i);// put one byte in its correct location
+          --msp->size;
+          i += 8;
+        }
+      }
+      else
+        val = X != 0 ?
0xFFFFFFFFu : 0;
+
+      // we accumulate in t and keep a count of the number of bits in bits
+      ui32 bits = 8 - msp->unstuff;
+      ui32 t = val & 0xFF;
+      bool unstuff = ((val & 0xFF) == 0xFF); // Do we need unstuffing next?
+
+      t |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - unstuff;
+      unstuff = (((val >> 8) & 0xFF) == 0xFF);
+
+      t |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - unstuff;
+      unstuff = (((val >> 16) & 0xFF) == 0xFF);
+
+      t |= ((val >> 24) & 0xFF) << bits;
+      bits += 8 - unstuff;
+      msp->unstuff = (((val >> 24) & 0xFF) == 0xFF); // for next byte
+
+      msp->tmp |= ((ui64)t) << msp->bits; // move data to msp->tmp
+      msp->bits += bits;
+    }
+
+    //************************************************************************/
+    /** @brief Read and unstuffs 8 bits from forward-growing bitstream
+     *
+     *  A template is used to accommodate a different requirement for
+     *  MagSgn and SPP bitstreams; in particular, when MagSgn bitstream is
+     *  consumed, 0xFF's are fed, while when SPP is exhausted 0's are fed in.
+     *  X controls this value.
+     *
+     *  Unstuffing prevents sequences larger than 0xFF7F from appearing in
+     *  the compressed sequence. So whenever a value of 0xFF is coded, the
+     *  MSB of the next byte is set to 0 and must be ignored during decoding.
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted
+     *  @param [in] msp is a pointer to frwd_struct structure
+     *
+     */
+    template<int X>
+    static inline
+    void frwd_read8(frwd_struct *msp)
+    {
+      ui8 val = X;
+      if (msp->size > 0) {
+        val = *msp->data; // read 8 bits
+        ++msp->data;      // increment pointer
+        --msp->size;      // reduce size
+      }
+
+      // unstuff and accumulate
+      ui8 t = msp->unstuff ? 1 : 0;
+      val = (ui8)(val & (0xFFU >> t));
+      msp->unstuff = (val == 0xFF);
+      msp->tmp |= ((ui64)val) << msp->bits; // move data to msp->tmp
+      msp->bits += 8 - t;
+    }
+
+    //************************************************************************/
+    /** @brief Initializes frwd_struct and reads some bytes
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted.
+     *          See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     *  @param [in] data is a pointer to the start of data
+     *  @param [in] size is the number of bytes in the bitstream
+     */
+    template<int X>
+    static inline
+    void frwd_init(frwd_struct *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      msp->tmp = 0;
+      msp->bits = 0;
+      msp->unstuff = 0;
+      msp->size = size;
+
+      //This code is designed for an architecture where the read address
+      // should align to the read size (address multiple of 4 if read size
+      // is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the bitstream
+      int num = 4 - (int)(intptr_t(msp->data) & 0x3);
+      for (int i = 0; i < num; ++i)
+      {
+        ui64 d;
+        //read a byte if the buffer is not exhausted, otherwise set it to X
+        d = msp->size-- > 0 ? *msp->data++ : X;
+        msp->tmp |= (d << msp->bits);        // store data in msp->tmp
+        msp->bits += 8 - msp->unstuff;       // number of bits added to tmp
+        msp->unstuff = ((d & 0xFF) == 0xFF); // unstuffing for next byte
+      }
+      frwd_read<X>(msp); // read 32 bits more
+    }
+
+    //************************************************************************/
+    /** @brief Initializes frwd_struct and reads some bytes
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted.
+     *          See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     *  @param [in] data is a pointer to the start of data
+     *  @param [in] size is the number of bytes in the bitstream
+     */
+    template<int X>
+    static inline
+    void frwd_init8(frwd_struct *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      msp->tmp = 0;
+      msp->bits = 0;
+      msp->unstuff = 0;
+      msp->size = size;
+      frwd_read8<X>(msp); // read 8 bits
+    }
+
+    //************************************************************************/
+    /** @brief Consume num_bits bits from the bitstream of frwd_struct
+     *
+     *  @param [in] msp is a pointer to frwd_struct
+     *  @param [in] num_bits is the number of bits to consume
+     */
+    static inline
+    void frwd_advance(frwd_struct *msp, ui32 num_bits)
+    {
+      assert(num_bits <= msp->bits);
+      msp->tmp >>= num_bits; // consume num_bits
+      msp->bits -= num_bits;
+    }
+
+    //************************************************************************/
+    /** @brief Fetches 32 bits from the frwd_struct bitstream
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted.
+     *          See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     */
+    template<int X>
+    static inline
+    ui32 frwd_fetch(frwd_struct *msp)
+    {
+      if (msp->bits < 32)
+      {
+        frwd_read<X>(msp);
+        if (msp->bits < 32) //need to test
+          frwd_read<X>(msp);
+      }
+      return (ui32)msp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Fetches up to 64 bits from the frwd_struct bitstream
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted.
+     *          See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     */
+    template<int X>
+    static inline
+    ui64 frwd_fetch64(frwd_struct *msp)
+    {
+      while (msp->bits <= 56)
+        frwd_read8<X>(msp);
+      return msp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Decodes one codeblock, processing the cleanup, significance
+     *         propagation, and magnitude refinement pass
+     *
+     *  @param [in] coded_data is a pointer to bitstream
+     *  @param [in] decoded_data is a pointer to decoded codeblock data buf.
+     *  @param [in] missing_msbs is the number of missing MSBs
+     *  @param [in] num_passes is the number of passes: 1 if CUP only,
+     *              2 for CUP+SPP, and 3 for CUP+SPP+MRP
+     *  @param [in] lengths1 is the length of cleanup pass
+     *  @param [in] lengths2 is the length of refinement passes (either SPP
+     *              only or SPP+MRP)
+     *  @param [in] width is the decoded codeblock width
+     *  @param [in] height is the decoded codeblock height
+     *  @param [in] stride is the decoded codeblock buffer stride
+     *  @param [in] stripe_causal is true for stripe causal mode
+     */
+    bool ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data,
+                                 ui32 missing_msbs, ui32 num_passes,
+                                 ui32 lengths1, ui32 lengths2,
+                                 ui32 width, ui32 height, ui32 stride,
+                                 bool stripe_causal)
+    {
+      // static bool insufficient_precision = false;
+      // static bool modify_code = false;
+      // static bool truncate_spp_mrp = false;
+
+      if (num_passes > 1 && lengths2 == 0)
+      {
+        OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
+                              "one coding pass, but zero length for "
+                              "2nd and potential 3rd pass.");
+        num_passes = 1;
+      }
+
+      if (num_passes > 3)
+      {
+        OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
+                              "This codeblock has %d passes.",
+                              num_passes);
+        return false;
+      }
+
+      // if (missing_msbs > 30) // p < 0
+      // {
+      //   if (insufficient_precision == false)
+      //   {
+      //     insufficient_precision = true;
+      //     OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
+      //                           "codeblock. This message will not be "
+      //                           "displayed again.");
+      //   }
+      //   return false;
+      // }
+      // else if (missing_msbs == 30) // p == 0
+      // { // not enough precision to decode and set the bin center to 1
+      //   if (modify_code == false) {
+      //     modify_code = true;
+      //     OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
+      //                           "pass. The code can be modified to support "
+      //                           "this case. This message will not be "
+      //                           "displayed again.");
+      //   }
+      //   return false; // 32 bits are not enough to decode this
+      // }
+      // else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
+      // {
+      //   if (num_passes > 1) {
+      //     num_passes = 1;
+      //     if (truncate_spp_mrp == false) {
+      //       truncate_spp_mrp = true;
+      //       OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
+      //                             "nor MagRef passes; both will be skipped. "
+      //                             "This message will not be displayed "
+      //                             "again.");
+      //     }
+      //   }
+      // }
+      ui32 p = 62 - missing_msbs; // The least significant bitplane for CUP
+      // There is a way to handle the case of p == 0, but a different path
+      // is required
+
+      if (lengths1 < 2)
+      {
+        OJPH_WARN(0x00010006, "Wrong codeblock length.");
+        return false;
+      }
+
+      // read scup and fix the bytes there
+      int lcup, scup;
+      lcup = (int)lengths1; // length of CUP
+      //scup is the length of MEL + VLC
+      scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
+      if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
+        return false;
+
+      // The temporary storage scratch holds two types of data in an
+      // interleaved fashion. The interleaving allows us to use one
+      // memory pointer.
+      // We have one entry for a decoded VLC code, and one entry for UVLC.
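+      // Within a quad row, even ui16 entries hold the decoded VLC
+      // information and odd entries hold u_q; see sp[0]/sp[2] and
+      // sp[1]/sp[3] below.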
+
+      // The temporary storage scratch holds two types of data in an
+      // interleaved fashion. The interleaving allows us to use one
+      // memory pointer.
+      // We have one entry for a decoded VLC code, and one entry for UVLC.
+      // Entries are 16 bits each, corresponding to one quad,
+      // but since we want to use XMM registers of the SSE family
+      // of SIMD, we allocate 16 bytes or more per quad row; that is,
+      // the width is no smaller than 16 bytes (or 8 entries), and the
+      // height is 512 quads
+      // Each VLC entry contains, in the following order, starting
+      // from MSB
+      // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
+      // Each entry in UVLC contains u_q
+      // One extra row to handle the case of SPP propagating downwards
+      // when codeblock width is 4
+      ui16 scratch[8 * 513] = {0}; // ~8 kB
+
+      // We need an extra two entries (one inf and one u_q) beyond
+      // the last column.
+      // If the block width is 4 (2 quads), then we use an sstr of 8
+      // (enough for 4 quads). If width is 8 (4 quads) we use an
+      // sstr of 16 (enough for 8 quads). For a width of 16 (8
+      // quads), we use 24 (enough for 12 quads).
+      ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
+
+      ui32 mmsbp2 = missing_msbs + 2;
+
+      // The cleanup pass is decoded in two steps; in step one,
+      // the VLC and MEL segments are decoded, generating a record that
+      // has 2 bytes per quad. The 2 bytes contain u, rho, e^1 & e^k.
+      // This information should be sufficient for the next step.
+      // In step 2, we decode the MagSgn segment.
+
+      // step 1 decoding VLC and MEL segments
+      {
+        // init structures
+        dec_mel_st mel;
+        mel_init(&mel, coded_data, lcup, scup);
+        rev_struct vlc;
+        rev_init8(&vlc, coded_data, lcup, scup);
+
+        int run = mel_get_run(&mel); // decode runs of events from MEL bitstream
+                                     // data represented as runs of 0 events
+                                     // See mel_decode description
+
+        ui64 vlc_val;
+        ui32 c_q = 0;
+        ui16 *sp = scratch;
+        //initial quad row
+        for (ui32 x = 0; x < width; sp += 4)
+        {
+          // decode VLC
+          /////////////
+
+          // first quad
+          vlc_val = rev_fetch64(&vlc);
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0) //zero context
+          {
+            run -= 2; //subtract 2, since event counts are multiplied by 2
+
+            // Is the run terminated in 1? if so, use decoded VLC code;
+            // otherwise, discard decoded data, since we will decode again
+            // using a different context
+            t0 = (run == -1) ? t0 : 0;
+
+            // is run -1 or -2? this means a run has been consumed
+            if (run < 0)
+              run = mel_get_run(&mel); // get another run
+          }
+          //run -= (c_q == 0) ? 2 : 0;
+          //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+          sp[0] = t0;
+          x += 2;
+
+          // prepare context for the next quad; eqn. 1 in ITU T.814
+          c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
+
+          //remove data from vlc stream (0 bits are removed if vlc is not used)
+          vlc_val = rev_advance64(&vlc, t0 & 0x7);
+
+          //second quad
+          ui16 t1 = 0;
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0 && x < width) //zero context
+          {
+            run -= 2; //subtract 2, since event counts are multiplied by 2
+
+            // if event is 0, discard decoded t1
+            t1 = (run == -1) ? t1 : 0;
+
+            if (run < 0) // have we consumed all events in a run
+              run = mel_get_run(&mel); // if yes, then get another run
+          }
+          t1 = x < width ? t1 : 0;
+          //run -= (c_q == 0 && x < width) ? 2 : 0;
+          //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+          sp[2] = t1;
+          x += 2;
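+
+          // Worked example (illustrative, not in the original): suppose
+          // mel_get_run() returned 5, i.e. 5 = 2*2 + 1: two zero events
+          // followed by a one event.  The first two zero-context quads
+          // each consume one event (run: 5 -> 3 -> 1) and discard their
+          // speculatively decoded VLC codes; the third zero-context quad
+          // sees run == -1 (1 - 2), keeps its decoded code, and a new run
+          // is then fetched from the MEL decoder.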
+
+          //prepare context for the next quad, eqn. 1 in ITU T.814
+          c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
+
+          //remove data from vlc stream, if qinf is not used, cwdlen is 0
+          vlc_val = rev_advance64(&vlc, t1 & 0x7);
+
+          // decode u
+          /////////////
+          // uvlc_mode is made up of u_offset bits from the quad pair
+          ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+          if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
+          {                     // the MEL run of events
+            run -= 2; //subtract 2, since event counts are multiplied by 2
+
+            uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
+                                                 // 0x40 if the event is 1
+
+            if (run < 0)//if run is consumed (run is -1 or -2), get another run
+              run = mel_get_run(&mel);
+          }
+          //run -= (uvlc_mode == 0xc0) ? 2 : 0;
+          //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+
+          //decode uvlc_mode to get u for both quads
+          ui32 idx = uvlc_mode + (ui32)(vlc_val & 0x3F);
+          ui32 uvlc_entry = uvlc_tbl0[idx];
+          ui16 u_bias = uvlc_bias[idx];
+          //remove total prefix length
+          vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+          uvlc_entry >>= 3;
+          //extract suffixes for quad 0 and 1
+          ui32 len = uvlc_entry & 0xF; // suffix length for 2 quads
+          ui32 tmp = (ui32)(vlc_val & ((1 << len) - 1)); // suffix value for 2 quads
+          vlc_val = rev_advance64(&vlc, len);
+          uvlc_entry >>= 4;
+          // quad 0 length
+          len = uvlc_entry & 0x7; // quad 0 suffix length
+          uvlc_entry >>= 3;
+          ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+          ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len));
+
+          // decode u_q extensions, which are needed only when u_q > 32
+          ui16 u_ext; bool cond0, cond1;
+          cond0 = u_q0 - (u_bias & 0x3) > 32;
+          u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0);
+          vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+          u_q0 = (ui16)(u_q0 + (u_ext << 2));
+          sp[1] = (ui16)(u_q0 + 1); // kappa = 1
+          cond1 = u_q1 - (u_bias >> 2) > 32;
+          u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0);
+          vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+          u_q1 = (ui16)(u_q1 + (u_ext << 2));
+          sp[3] = (ui16)(u_q1 + 1); // kappa = 1
+        }
+        sp[0] = sp[1] = 0;
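+
+        // Recap (added for illustration, not in the original): after the
+        // loop above, each quad of the initial row occupies two ui16
+        // entries in scratch:
+        //   sp[0] = VLC entry: e_k in 0xF000, e_1 in 0x0F00, rho in 0x00F0
+        //   sp[1] = u_q + 1, where the added 1 is kappa for the first row
+        // For example, sp[0] == 0x2A30 means e_k = 0x2, e_1 = 0xA, and
+        // rho = 0x3, i.e. only the left column of the quad is significant.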
+
+        //non initial quad rows
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          c_q = 0;                              // context
+          ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
+
+          for (ui32 x = 0; x < width; sp += 4)
+          {
+            // decode VLC
+            /////////////
+
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
+
+            // first quad
+            vlc_val = rev_fetch64(&vlc);
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) //zero context
+            {
+              run -= 2; //subtract 2, since event counts are multiplied by 2
+
+              // Is the run terminated in 1? if so, use decoded VLC code;
+              // otherwise, discard decoded data, since we will decode again
+              // using a different context
+              t0 = (run == -1) ? t0 : 0;
+
+              // is run -1 or -2? this means a run has been consumed
+              if (run < 0)
+                run = mel_get_run(&mel); // get another run
+            }
+            //run -= (c_q == 0) ? 2 : 0;
+            //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel); // get another run
+            sp[0] = t0;
+            x += 2;
+
+            // prepare context for the next quad; eqn. 2 in ITU T.814
+            // sigma_q (w, sw)
+            c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[0 - (si32)sstr] & 0x80;
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
+
+            //remove data from vlc stream (0 bits are removed if vlc is unused)
+            vlc_val = rev_advance64(&vlc, t0 & 0x7);
+
+            //second quad
+            ui16 t1 = 0;
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0 && x < width) //zero context
+            {
+              run -= 2; //subtract 2, since event counts are multiplied by 2
+
+              // if event is 0, discard decoded t1
+              t1 = (run == -1) ? t1 : 0;
+
+              if (run < 0) // have we consumed all events in a run
+                run = mel_get_run(&mel); // if yes, then get another run
+            }
+            t1 = x < width ? t1 : 0;
+            //run -= (c_q == 0 && x < width) ? 2 : 0;
+            //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel); // get another run
+            sp[2] = t1;
+            x += 2;
+
+            // partial c_q, will be completed when we process the next quad
+            // sigma_q (w, sw)
+            c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[2 - (si32)sstr] & 0x80;
+
+            //remove data from vlc stream, if qinf is not used, cwdlen is 0
+            vlc_val = rev_advance64(&vlc, t1 & 0x7);
+
+            // decode u
+            /////////////
+            // uvlc_mode is made up of u_offset bits from the quad pair
+            ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+            ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
+            //remove total prefix length
+            vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+            uvlc_entry >>= 3;
+            //extract suffixes for quad 0 and 1
+            ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
+            ui32 tmp = (ui32)(vlc_val & ((1 << len) - 1)); //suffix value for 2 quads
+            vlc_val = rev_advance64(&vlc, len);
+            uvlc_entry >>= 4;
+            // quad 0 length
+            len = uvlc_entry & 0x7; // quad 0 suffix length
+            uvlc_entry >>= 3;
+            ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+            ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
+
+            // decode u_q extensions, which are needed only when u_q > 32
+            ui16 u_ext; bool cond0, cond1;
+            cond0 = u_q0 > 32;
+            u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0);
+            vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+            u_q0 = (ui16)(u_q0 + (u_ext << 2));
+            sp[1] = u_q0;
+            cond1 = u_q1 > 32;
+            u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0);
+            vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+            u_q1 = (ui16)(u_q1 + (u_ext << 2));
+            sp[3] = u_q1;
+          }
+          sp[0] = sp[1] = 0;
+        }
+      }
+
+      // step 2: we decode the MagSgn segment
+      {
+        // We allocate a scratch row for storing v_n values.
+        // We have 512 quads horizontally.
+        // We need an extra entry to handle the case of vp[1]
+        // when vp is at the last column.
+        // Here, we allocate 4 instead of 1 to make the buffer size
+        // a multiple of 16 bytes.
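+        // Note (added for clarity, not in the original): each v_n entry
+        // accumulates the decoded magnitude patterns of bottom-row samples
+        // from two horizontally adjacent quads; the loop over the next row
+        // of quads applies 63 - count_leading_zeros(2 | vp[0] | vp[1]) to
+        // these values to derive the exponent bound that feeds kappa.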
+ const int v_n_size = 512 + 4; + ui64 v_n_scratch[v_n_size] = {0}; // 4+ kB + + frwd_struct magsgn; + frwd_init8<0xFF>(&magsgn, coded_data, lcup - scup); + + const ui16 *sp = scratch; + ui64 *vp = v_n_scratch; + ui64 *dp = decoded_data; + + ui64 prev_v_n = 0; + for (ui32 x = 0; x < width; sp += 2, ++vp) + { + ui32 inf = sp[0]; + ui32 U_q = sp[1]; + if (U_q > mmsbp2) + return false; + + ui64 v_n; + ui64 val = 0; + ui32 bit = 0; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= width) + { ++vp; break; } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + + for (ui32 y = 2; y < height; y += 2) + { + const ui16 *sp = scratch + (y >> 1) * sstr; + ui64 *vp = v_n_scratch; + ui64 *dp = decoded_data + y * stride; + + prev_v_n = 0; + for (ui32 x = 0; x < width; sp += 2, ++vp) + { + ui32 inf = sp[0]; + ui32 u_q = sp[1]; + + ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1? + ui32 emax = 63 - count_leading_zeros(2 | vp[0] | vp[1]); // emax-1 + ui32 kappa = gamma ? 
emax : 1; + + ui32 U_q = u_q + kappa; + if (U_q > mmsbp2) + return false; + + ui64 v_n; + ui64 val = 0; + ui32 bit = 0; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= width) + { ++vp; break; } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + } + } + + if (num_passes > 1) + { + // We use scratch again, we can divide it into multiple regions + // sigma holds all the significant samples, and it cannot + // be modified after it is set. 
+      // It will be used during the
+      // Magnitude Refinement Pass
+      ui16* const sigma = scratch;
+
+      ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
+                                       // ui16 contains 4 columns
+      mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
+
+      // We re-arrange quad significance, where each 4 consecutive
+      // bits represent one quad, into column significance, where
+      // each 4 consecutive bits represent one column of 4 rows
+      {
+        ui32 y;
+        for (y = 0; y < height; y += 4)
+        {
+          ui16* sp = scratch + (y >> 1) * sstr;
+          ui16* dp = sigma + (y >> 2) * mstr;
+          for (ui32 x = 0; x < width; x += 4, sp += 4, ++dp) {
+            ui32 t0 = 0, t1 = 0;
+            t0  = ((sp[0     ] & 0x30u) >> 4) | ((sp[0     ] & 0xC0u) >> 2);
+            t0 |= ((sp[2     ] & 0x30u) << 4) | ((sp[2     ] & 0xC0u) << 6);
+            t1  = ((sp[0+sstr] & 0x30u) >> 2) | ((sp[0+sstr] & 0xC0u)     );
+            t1 |= ((sp[2+sstr] & 0x30u) << 6) | ((sp[2+sstr] & 0xC0u) << 8);
+            dp[0] = (ui16)(t0 | t1);
+          }
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+        {
+          // reset one row after the codeblock
+          ui16* dp = sigma + (y >> 2) * mstr;
+          for (ui32 x = 0; x < width; x += 4, ++dp)
+            dp[0] = 0;
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+      }
+
+      // We perform Significance Propagation Pass here
+      {
+        // This stores significance information of the previous
+        // 4 rows.  Significance information in this array includes
+        // all significant samples in bitplane p - 1; that is,
+        // significant samples for bitplane p (discovered during the
+        // cleanup pass and stored in sigma) and samples that have recently
+        // become significant (during the SPP) in bitplane p-1.
+        // We store enough for the widest row, containing 1024 columns,
+        // which is equivalent to 256 ui16 entries, since each stores 4 columns.
+        // We add an extra 8 entries, just in case we need more
+        ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
+
+        frwd_struct sigprop;
+        frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
+
+        for (ui32 y = 0; y < height; y += 4)
+        {
+          ui32 pattern = 0xFFFFu; // a pattern of needed samples
+          if (height - y < 4) {
+            pattern = 0x7777u;
+            if (height - y < 3) {
+              pattern = 0x3333u;
+              if (height - y < 2)
+                pattern = 0x1111u;
+            }
+          }
+
+          // prev holds significance info for the previous quad, together
+          // with the rows on top of it and below it.
+          ui32 prev = 0;
+          ui16 *prev_sig = prev_row_sig;
+          ui16 *cur_sig = sigma + (y >> 2) * mstr;
+          ui64 *dpp = decoded_data + y * stride;
+          for (ui32 x = 0; x < width; x += 4, ++cur_sig, ++prev_sig)
+          {
+            // only rows and columns inside the stripe are included
+            si32 s = (si32)x + 4 - (si32)width;
+            s = ojph_max(s, 0);
+            pattern = pattern >> (s * 4);
+
+            // We first find locations that need to be tested (potential
+            // SPP members); these locations will end up in mbr.
+            // In each iteration, we produce 16 bits because cwd can have
+            // up to 16 bits of significance information, followed by the
+            // corresponding 16 bits of sign information; therefore, it is
+            // sufficient to fetch 32 bits of data per loop.
+
+            // Although we are interested in 16 bits only, we load 32 bits.
+            // For the 16 bits we are producing, we need the next 4 bits --
+            // We need data for at least 5 columns out of 8.
+            // Therefore loading 32 bits is easier than loading 16 bits
+            // twice.
+            ui32 ps = *(ui32*)prev_sig;
+            ui32 ns = *(ui32*)(cur_sig + mstr);
+            ui32 u = (ps & 0x88888888) >> 3; // the row on top
+            if (!stripe_causal)
+              u |= (ns & 0x11111111) << 3;   // the row below
+
+            ui32 cs = *(ui32*)cur_sig;
+            // vertical integration
+            ui32 mbr = cs; // this sig. info.
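+            // Note (added for clarity, not in the original): each ui16 of
+            // sigma packs a 4-wide by 4-high block column by column, one
+            // nibble per column, with the LSB of a nibble being the top
+            // row.  The two shifts below therefore dilate significance to
+            // vertical neighbors within each column, while the << 4 and
+            // >> 4 shifts that follow spread it to adjacent columns.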
+ mbr |= (cs & 0x77777777) << 1; //above neighbors + mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors + mbr |= u; + // horizontal integration + ui32 t = mbr; + mbr |= t << 4; // neighbors on the left + mbr |= t >> 4; // neighbors on the right + mbr |= prev >> 12; // significance of previous group + + // remove outside samples, and already significant samples + mbr &= pattern; + mbr &= ~cs; + + // find samples that become significant during the SPP + ui32 new_sig = mbr; + if (new_sig) + { + ui64 cwd = frwd_fetch<0>(&sigprop); + + ui32 cnt = 0; + ui32 col_mask = 0xFu; + ui32 inv_sig = ~cs & pattern; + for (int i = 0; i < 16; i += 4, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan one column + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x33u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x76u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xECu << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xC8u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + } + + if (new_sig) + { + // new_sig has newly-discovered sig. samples during SPP + // find the signs and update decoded_data + ui64 *dp = dpp + x; + ui64 val = 3u << (p - 2); + col_mask = 0xFu; + for (int i = 0; i < 4; ++i, ++dp, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan 4 signs + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + assert(dp[0] == 0); + dp[0] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[stride] == 0); + dp[stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[2 * stride] == 0); + dp[2 * stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[3 * stride] == 0); + dp[3 * stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + } + } + frwd_advance(&sigprop, cnt); + } + + new_sig |= cs; + *prev_sig = (ui16)(new_sig); + + // vertical integration for the new sig. info. + t = new_sig; + new_sig |= (t & 0x7777) << 1; //above neighbors + new_sig |= (t & 0xEEEE) >> 1; //below neighbors + // add sig. info. 
+            // from the row on top and below
+            prev = new_sig | u;
+            // we need only the bits in 0xF000
+            prev &= 0xF000;
+          }
+        }
+      }
+
+      // We perform Magnitude Refinement Pass here
+      if (num_passes > 2)
+      {
+        rev_struct magref;
+        rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
+
+        for (ui32 y = 0; y < height; y += 4)
+        {
+          ui32 *cur_sig = (ui32*)(sigma + (y >> 2) * mstr);
+          ui64 *dpp = decoded_data + y * stride;
+          ui64 half = 1ULL << (p - 2);
+          for (ui32 i = 0; i < width; i += 8)
+          {
+            //Process one entry from sigma array at a time
+            // Each nibble (4 bits) in the sigma array represents 4 rows,
+            // and the 32 bits contain 8 columns
+            ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
+            ui32 sig = *cur_sig++; // 32 bit that will be processed now
+            ui32 col_mask = 0xFu;  // a mask for a column in sig
+            if (sig) // if any of the 32 bits are set
+            {
+              for (int j = 0; j < 8; ++j) //one column at a time
+              {
+                if (sig & col_mask) // lowest nibble
+                {
+                  ui64 *dp = dpp + i + j; // next column in decoded samples
+                  ui32 sample_mask = 0x11111111u & col_mask; //LSB
+
+                  for (int k = 0; k < 4; ++k) {
+                    if (sig & sample_mask) //if LSB is set
+                    {
+                      assert(dp[0] != 0); // decoded value cannot be zero
+                      assert((dp[0] & half) == 0); // no half
+                      ui64 sym = cwd & 1; // get its value
+                      sym = (1 - sym) << (p - 1); // previous center of bin
+                      sym |= half; // put half the center of bin
+                      dp[0] ^= sym; // remove old bin center and put new
+                      cwd >>= 1;    // consume one bit
+                    }
+                    sample_mask += sample_mask; //next row
+                    dp += stride;               // next samples row
+                  }
+                }
+                col_mask <<= 4; //next column
+              }
+            }
+            // consume data according to the number of bits set
+            rev_advance_mrp(&magref, population_count(sig));
+          }
+        }
+      }
+      }
+      return true;
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/core/coding/ojph_block_decoder_avx2.cpp b/src/core/coding/ojph_block_decoder_avx2.cpp
new file mode 100644
index 0000000..156ba1a
--- /dev/null
+++ b/src/core/coding/ojph_block_decoder_avx2.cpp
@@ -0,0 +1,2041 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2022, Aous Naman
+// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2022, The University of New South Wales, Australia
+// Copyright (c) 2024, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_block_decoder_avx2.cpp
+//***************************************************************************/
+
+//***************************************************************************/
+/** @file ojph_block_decoder_avx2.cpp
+ *  @brief implements a faster HTJ2K block decoder using avx2
+ */
+
+#include <string>
+#include <iostream>
+
+#include <cassert>
+#include <cstring>
+#include "ojph_block_common.h"
+#include "ojph_block_decoder.h"
+#include "ojph_arch.h"
+#include "ojph_message.h"
+
+#include <immintrin.h>
+
+namespace ojph {
+  namespace local {
+
+    //************************************************************************/
+    /** @brief MEL state structure for reading and decoding the MEL bitstream
+     *
+     *  A number of events is decoded from the MEL bitstream ahead of time
+     *  and stored in run/num_runs.
+     *  Each run represents the number of zero events before a one event.
+     */
+    struct dec_mel_st {
+      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
+                     k(0), num_runs(0), runs(0)
+      {}
+      // data decoding machinery
+      ui8* data;    //!< the address of data (or bitstream)
+      ui64 tmp;     //!< temporary buffer for read data
+      int bits;     //!< number of bits stored in tmp
+      int size;     //!< number of bytes in MEL code
+      bool unstuff; //!< true if the next bit needs to be unstuffed
+      int k;        //!< state of MEL decoder
+
+      // queue of decoded runs
+      int num_runs; //!< number of decoded runs left in runs (maximum 8)
+      ui64 runs;    //!< decoded runs of MEL codewords (7 bits/run)
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs the MEL bitstream
+     *
+     *  Bytes are read 32 bits at a time and unstuffed: a byte that follows
+     *  a 0xFF byte contributes only 7 bits, so the 32 bits read here can
+     *  yield as few as 28 usable bits.  When the MEL segment is exhausted,
+     *  0xFF bytes are fed in.
+     *
+     *  @param [in] melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_read(dec_mel_st *melp)
+    {
+      if (melp->bits > 32)  //there are enough bits in the tmp variable
+        return;             // return without reading new data
+
+      ui32 val = 0xFFFFFFFF;      // feed in 0xFF if buffer is exhausted
+      if (melp->size > 4) {       // if there is data in the MEL segment
+        val = *(ui32*)melp->data; // read 32 bits from MEL data
+        melp->data += 4;          // advance pointer
+        melp->size -= 4;          // reduce counter
+      }
+      else if (melp->size > 0)
+      { // 4 or less
+        int i = 0;
+        while (melp->size > 1) {
+          ui32 v = *melp->data++;    // read one byte at a time
+          ui32 m = ~(0xFFu << i);    // mask of location
+          val = (val & m) | (v << i);// put one byte in its correct location
+          --melp->size;
+          i += 8;
+        }
+        // size equal to 1
+        ui32 v = *melp->data++; // the one before the last is different
+        v |= 0xF;               // MEL and VLC segments can overlap
+        ui32 m = ~(0xFFu << i);
+        val = (val & m) | (v << i);
+        --melp->size;
+      }
+
+      // next we unstuff them before adding them to the buffer
+      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
+                                     // the previously read byte requires
+                                     // unstuffing
+
+      // data is unstuffed and accumulated in t
+      // bits has the number of bits in t
+      ui32 t = val & 0xFF;
+      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
+      bits -= unstuff; // there is one less bit in t if unstuffing is needed
+      t = t << (8 - unstuff); // move up to make room for the next byte
+
+      //this is a repeat of the above
+      t |= (val >> 8) & 0xFF;
+      unstuff = (((val >> 8) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val >> 16) & 0xFF;
+      unstuff = (((val >> 16) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val >> 24) & 0xFF;
+      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+
+      // move t to tmp, and push the result all the way up, so we read from
+      // the MSB
+      melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
+      melp->bits += bits; //increment the number of bits in tmp
+    }
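+
+    // Worked example (illustrative, not in the original): if the four
+    // bytes read by mel_read are 0x12 0xFF 0x7C 0x45, the 0xFF byte
+    // triggers unstuffing of the byte that follows it; the encoder
+    // guarantees the byte after a 0xFF has a zero MSB, so only 7 of its
+    // bits carry information, and these 32 raw bits contribute 31 bits
+    // to melp->tmp.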
+
+    //************************************************************************/
+    /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs
+     *
+     *  Runs are stored in "runs" and the number of runs in "num_runs".
+     *  Each run represents a number of zero events that may or may not
+     *  terminate in a 1 event.
+     *  Each run is stored in 7 bits.  The LSB is 1 if the run terminates in
+     *  a 1 event, 0 otherwise.  The next 6 bits, for the case terminating
+     *  with 1, contain the number of consecutive zero events * 2; for the
+     *  case terminating with 0, they store (number of consecutive zero
+     *  events - 1) * 2.
+     *  A total of 6 bits (made up of 1 + 5) should have been enough.
+     *
+     *  @param [in] melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_decode(dec_mel_st *melp)
+    {
+      static const int mel_exp[13] = { //MEL exponents
+        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
+      };
+
+      if (melp->bits < 6) // if there are fewer than 6 bits in tmp
+        mel_read(melp);   // then read from the MEL bitstream
+                          // 6 bits is the largest decodable MEL cwd
+
+      //repeat as long as there are enough decodable bits in tmp,
+      // and the runs store is not full (num_runs < 8)
+      while (melp->bits >= 6 && melp->num_runs < 8)
+      {
+        int eval = mel_exp[melp->k]; // number of bits associated with state
+        int run = 0;
+        if (melp->tmp & (1ull << 63)) //The next bit to decode (stored in MSB)
+        { //one is found
+          run = 1 << eval;
+          run--; // consecutive runs of 0 events - 1
+          melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
+          melp->tmp <<= 1; // consume one bit from tmp
+          melp->bits -= 1;
+          run = run << 1; // a stretch of zeros not terminating in one
+        }
+        else
+        { //0 is found
+          run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
+          melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
+          melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
+          melp->bits -= eval + 1;
+          run = (run << 1) + 1; // a stretch of zeros terminating with one
+        }
+        eval = melp->num_runs * 7;           // 7 bits per run
+        melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
+        melp->runs |= ((ui64)run) << eval;   // store the value in runs
+        melp->num_runs++;                    // increment count
+      }
+    }
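+
+    // Worked example (illustrative, not in the original): with state
+    // k = 6, mel_exp[6] = 2, so a single '1' bit decodes to 2^2 = 4 zero
+    // events with no terminating one event (stored as run = (4-1)*2 = 6),
+    // while a '0' bit followed by a 2-bit suffix s decodes to s zero
+    // events terminated by a one event (stored as run = 2*s + 1).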
+
+    //************************************************************************/
+    /** @brief Initiates a dec_mel_st structure for MEL decoding and reads
+     *         some bytes in order to get the read address to a multiple
+     *         of 4
+     *
+     *  @param [in] melp is a pointer to dec_mel_st structure
+     *  @param [in] bbuf is a pointer to byte buffer
+     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in] scup is the length of MEL+VLC segments
+     */
+    static inline
+    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
+    {
+      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
+      melp->bits = 0;                  // 0 bits in tmp
+      melp->tmp = 0;                   //
+      melp->unstuff = false;           // no unstuffing
+      melp->size = scup - 1;           // size is the length of MEL+VLC-1
+      melp->k = 0;                     // 0 for state
+      melp->num_runs = 0;              // num_runs is 0
+      melp->runs = 0;                  //
+
+      //This code is borrowed; original is for a different architecture
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1, 2, 3 up to 4 bytes from the MEL segment
+      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
+      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
+        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
+        ui64 d = (melp->size > 0) ? *melp->data : 0xFF; //if buffer is consumed
+                                                        //set data to 0xFF
+        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
+                                       // see the standard
+        melp->data += melp->size-- > 0; //increment if the end is not reached
+        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
+        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
+        melp->bits += d_bits;           //increment tmp by number of bits
+        melp->unstuff = ((d & 0xFF) == 0xFF); //true if the next byte needs
+                                              //unstuffing
+      }
+      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
+                                       // is the MSB
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves one run from dec_mel_st; if there are no runs
+     *         stored, the MEL segment is decoded
+     *
+     *  @param [in] melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    int mel_get_run(dec_mel_st *melp)
+    {
+      if (melp->num_runs == 0) //if no runs, decode more bits from MEL segment
+        mel_decode(melp);
+
+      int t = melp->runs & 0x7F; //retrieve one run
+      melp->runs >>= 7;          // remove the retrieved run
+      melp->num_runs--;
+      return t;                  // return run
+    }
+
+    //************************************************************************/
+    /** @brief A structure for reading and unstuffing a segment that grows
+     *         backward, such as VLC and MRP
+     */
+    struct rev_struct {
+      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
+      {}
+      //storage
+      ui8* data;    //!< pointer to where to read data
+      ui64 tmp;     //!< temporary buffer of read data
+      ui32 bits;    //!< number of bits stored in tmp
+      int size;     //!< number of bytes left
+      bool unstuff; //!< true if the last byte is more than 0x8F
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs data from a backwardly-growing segment
+     *
+     *  Reading proceeds 4 bytes at a time, from higher to lower addresses.
+     *  Unstuffing drops the MSB of a byte whose lower 7 bits are all ones
+     *  whenever the byte that follows it (at the higher address, read
+     *  earlier) is larger than 0x8F.  The unstuffed bits accumulate in tmp.
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read(rev_struct *vlcp)
+    {
+      //process 4 bytes at a time
+      if (vlcp->bits > 32) // if there are more than 32 bits in tmp, then
+        return;            // reading 32 bits can overflow vlcp->tmp
+      ui32 val = 0;
+      //the next line (the if statement) needs to be tested first
+      if (vlcp->size > 3) // if there are more than 3 bytes left in VLC
+      {
+        // (vlcp->data - 3) move pointer back to read 32 bits at once
+        val = *(ui32*)(vlcp->data - 3); // then read 32 bits
+        vlcp->data -= 4;                // move data pointer back by 4
+        vlcp->size -= 4;                // reduce available bytes by 4
+      }
+      else if (vlcp->size > 0)
+      { // 4 or less
+        int i = 24;
+        while (vlcp->size > 0) {
+          ui32 v = *vlcp->data--; // read one byte at a time
+          val |= (v << i);        // put byte in its correct location
+          --vlcp->size;
+          i -= 8;
+        }
+      }
+
+      __m128i tmp_vec = _mm_set1_epi32((int32_t)val);
+      tmp_vec = _mm_srlv_epi32(tmp_vec, _mm_setr_epi32(24, 16, 8, 0));
+      tmp_vec = _mm_and_si128(tmp_vec, _mm_set1_epi32(0xff));
+
+      __m128i unstuff_vec = _mm_cmpgt_epi32(tmp_vec, _mm_set1_epi32(0x8F));
+      bool unstuff_next = _mm_extract_epi32(unstuff_vec, 3);
+      unstuff_vec = _mm_slli_si128(unstuff_vec, 4);
+      unstuff_vec = _mm_insert_epi32(unstuff_vec, vlcp->unstuff * 0xffffffff, 0);
+
+      __m128i val_7f = _mm_set1_epi32(0x7F);
+      __m128i this_byte_7f = _mm_cmpeq_epi32(_mm_and_si128(tmp_vec, val_7f), val_7f);
+      unstuff_vec = _mm_and_si128(unstuff_vec, this_byte_7f);
+      unstuff_vec = _mm_srli_epi32(unstuff_vec, 31);
+
+      __m128i inc_sum = _mm_sub_epi32(_mm_set1_epi32(8), unstuff_vec);
+      inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
+      inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
+      ui32 total_bits = (ui32)_mm_extract_epi32(inc_sum, 3);
+
+      __m128i final_shift = _mm_slli_si128(inc_sum, 4);
+      tmp_vec = _mm_sllv_epi32(tmp_vec, final_shift);
+      tmp_vec = _mm_or_si128(tmp_vec, _mm_bsrli_si128(tmp_vec, 8));
+
+      ui64 tmp =
(ui32)_mm_cvtsi128_si32(tmp_vec) | (ui32)_mm_extract_epi32(tmp_vec, 1); + + vlcp->unstuff = unstuff_next; + vlcp->tmp |= tmp << vlcp->bits; + vlcp->bits += total_bits; + } + + //************************************************************************/ + /** @brief Initiates the rev_struct structure and reads a few bytes to + * move the read address to multiple of 4 + * + * There is another similar rev_init_mrp subroutine. The difference is + * that this one, rev_init, discards the first 12 bits (they have the + * sum of the lengths of VLC and MEL segments), and first unstuff depends + * on first 4 bits. + * + * @param [in] vlcp is a pointer to rev_struct structure + * @param [in] data is a pointer to byte at the start of the cleanup pass + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup) + { + //first byte has only the upper 4 bits + vlcp->data = data + lcup - 2; + + //size can not be larger than this, in fact it should be smaller + vlcp->size = scup - 2; + + ui32 d = *vlcp->data--; // read one byte (this is a half byte) + vlcp->tmp = d >> 4; // both initialize and set + vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard + vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream. + // To read 32 bits, read from (vlcp->data - 3) + int num = 1 + (int)(intptr_t(vlcp->data) & 0x3); + int tnum = num < vlcp->size ? num : vlcp->size; + for (int i = 0; i < tnum; ++i) { + ui64 d; + d = *vlcp->data--; // read one byte and move read pointer + //check if the last byte was >0x8F (unstuff == true) and this is 0x7F + ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 
1 : 0);
+        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
+        vlcp->bits += d_bits;
+        vlcp->unstuff = d > 0x8F; // for next byte
+      }
+      vlcp->size -= tnum;
+      rev_read(vlcp); // read another 32 bits
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, vlcp->tmp must have no less than 33 bits
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch(rev_struct *vlcp)
+    {
+      if (vlcp->bits < 32) // if there are fewer than 32 bits, read more
+      {
+        rev_read(vlcp);      // read 32 bits, but unstuffing might reduce this
+        if (vlcp->bits < 32) // if there is still space in vlcp->tmp for 32 bits
+          rev_read(vlcp);    // read another 32
+      }
+      return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     *  @param [in] num_bits is the number of bits to be removed
+     */
+    static inline
+    ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
+    {
+      assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
+      vlcp->tmp >>= num_bits;         // remove bits
+      vlcp->bits -= num_bits;         // decrement the number of bits
+      return (ui32)vlcp->tmp;
+    }
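+
+    // An illustrative usage sketch (hypothetical, not used by this file):
+    // VLC decoding follows the same peek-then-consume protocol as the
+    // forward reader.  With a VLC table such as vlc_tbl0 and a context
+    // label c_q, one quad is decoded roughly as
+    //
+    //   ui32 head = rev_fetch(&vlc);                // peek >= 32 bits
+    //   ui16 entry = vlc_tbl0[c_q + (head & 0x7F)]; // 7-bit VLC lookup
+    //   rev_advance(&vlc, entry & 0x7);             // consume cwd length
+    //
+    // where the low 3 bits of a table entry hold the codeword length.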
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs from rev_struct
+     *
+     *  This differs from rev_read in that it fills in zeros when the
+     *  available data is consumed; the other does not care about the
+     *  values once all data is consumed.
+     *
+     *  See rev_read for more information about unstuffing
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read_mrp(rev_struct *mrp)
+    {
+      //process 4 bytes at a time
+      if (mrp->bits > 32)
+        return;
+      ui32 val = 0;
+      if (mrp->size > 3) // If there are 3 bytes or more
+      { // (mrp->data - 3) move pointer back to read 32 bits at once
+        val = *(ui32*)(mrp->data - 3); // read 32 bits
+        mrp->data -= 4;                // move back pointer
+        mrp->size -= 4;                // reduce count
+      }
+      else if (mrp->size > 0)
+      {
+        int i = 24;
+        while (mrp->size > 0) {
+          ui32 v = *mrp->data--; // read one byte at a time
+          val |= (v << i);       // put byte in its correct location
+          --mrp->size;
+          i -= 8;
+        }
+      }
+
+      //accumulate in tmp, and keep count in bits
+      ui32 bits, tmp = val >> 24;
+
+      //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
+      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
+      bool unstuff = (val >> 24) > 0x8F;
+
+      //process the next byte
+      tmp |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 16) & 0xFF) > 0x8F;
+
+      tmp |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 8) & 0xFF) > 0x8F;
+
+      tmp |= (val & 0xFF) << bits;
+      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = (val & 0xFF) > 0x8F;
+
+      mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
+      mrp->bits += bits;
+      mrp->unstuff = unstuff; // next byte
+    }
+
+    //************************************************************************/
+    /** @brief Initializes the rev_struct structure for the MRP segment,
+     *         and reads a number of bytes such that the next 32 bits read
+     *         are from an address that is a multiple of 4.  Note that this
+     *         is designed for an architecture where the read size must be
+     *         compatible with the alignment of the read address
+     *
+     *  There is another similar subroutine rev_init.  This subroutine does
+     *  NOT skip the first 12 bits, and starts with unstuff set to true.
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     *  @param [in] data is a pointer to byte at the start of the cleanup pass
+     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in] len2 is the length of SPP+MRP segments
+     */
+    static inline
+    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
+    {
+      mrp->data = data + lcup + len2 - 1;
+      mrp->size = len2;
+      mrp->unstuff = true;
+      mrp->bits = 0;
+      mrp->tmp = 0;
+
+      //This code is designed for an architecture where the read address
+      // should align to the read size (address multiple of 4 if read size is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1, 2, 3 up to 4 bytes from the MRP stream
+      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
+      for (int i = 0; i < num; ++i) {
+        ui64 d;
+        //read a byte, 0 if no more data
+        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
+        //check if unstuffing is needed
+        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
+        mrp->tmp |= d << mrp->bits; // move data to mrp->tmp
+        mrp->bits += d_bits;
+        mrp->unstuff = d > 0x8F; // for next byte
+      }
+      rev_read_mrp(mrp);
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, mrp->tmp must have no less than 33 bits
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch_mrp(rev_struct *mrp)
+    {
+      if (mrp->bits < 32) // if there are fewer than 32 bits in mrp->tmp
+      {
+        rev_read_mrp(mrp);   // read 30-32 bits from mrp
+        if (mrp->bits < 32)  // if there is still space for 32 bits
+          rev_read_mrp(mrp); // read more
+      }
+      return (ui32)mrp->tmp; // return the head of mrp->tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     *  @param [in] num_bits is the number of bits to be removed
+     */
+    inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
+    {
+      assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
+      mrp->tmp >>= num_bits;         // discard the lowest num_bits bits
+      mrp->bits -= num_bits;
+      return (ui32)mrp->tmp;         // return data after consumption
+    }
+
+    //************************************************************************/
+    /** @brief State structure for reading and unstuffing of forward-growing
+     *         bitstreams; these are: MagSgn and SPP bitstreams
+     */
+    struct frwd_struct {
+      const ui8* data;  //!< pointer to bitstream
+      ui8 tmp[48];      //!< temporary buffer of read data, with extra room
+      ui32 bits;        //!< number of bits stored in tmp
+      ui32 unstuff;     //!< 1 if a bit needs to be unstuffed from next byte
+      int size;         //!< size of data
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs 16 bytes from a forward-growing bitstream
+     *
+     *  A byte that follows a 0xFF byte loses its MSB (unstuffing).  When
+     *  the bitstream is exhausted, the value X (0 or 0xFF) is fed in.
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted
+     *  @param [in] msp is a pointer to frwd_struct structure
+     */
+    template<int X>
+    static inline
+    void frwd_read(frwd_struct *msp)
+    {
+      assert(msp->bits <= 128);
+
+      __m128i offset, val, validity, all_xff;
+      val = _mm_loadu_si128((__m128i*)msp->data);
+      int bytes = msp->size >= 16 ?
16 : msp->size; + validity = _mm_set1_epi8((char)bytes); + msp->data += bytes; + msp->size -= bytes; + int bits = 128; + offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908,0x0706050403020100); + validity = _mm_cmpgt_epi8(validity, offset); + all_xff = _mm_set1_epi8(-1); + if (X == 0xFF) // the compiler should remove this if statement + { + __m128i t = _mm_xor_si128(validity, all_xff); // complement + val = _mm_or_si128(t, val); // fill with 0xFF + } + else if (X == 0) + val = _mm_and_si128(validity, val); // fill with zeros + else + assert(0); + + __m128i ff_bytes; + ff_bytes = _mm_cmpeq_epi8(val, all_xff); + ff_bytes = _mm_and_si128(ff_bytes, validity); + ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes); + flags <<= 1; // unstuff following byte + ui32 next_unstuff = flags >> 16; + flags |= msp->unstuff; + flags &= 0xFFFF; + while (flags) + { // bit unstuffing occurs on average once every 256 bytes + // therefore it is not an issue if it is a bit slow + // here we process 16 bytes + --bits; // consuming one stuffing bit + + ui32 loc = 31 - count_leading_zeros(flags); + flags ^= 1 << loc; + + __m128i m, t, c; + t = _mm_set1_epi8((char)loc); + m = _mm_cmpgt_epi8(offset, t); + + t = _mm_and_si128(m, val); // keep bits at locations larger than loc + c = _mm_srli_epi64(t, 1); // 1 bits left + t = _mm_srli_si128(t, 8); // 8 bytes left + t = _mm_slli_epi64(t, 63); // keep the MSB only + t = _mm_or_si128(t, c); // combine the above 3 steps + + val = _mm_or_si128(t, _mm_andnot_si128(m, val)); + } + + // combine with earlier data + assert(msp->bits >= 0 && msp->bits <= 128); + int cur_bytes = msp->bits >> 3; + int cur_bits = msp->bits & 7; + __m128i b1, b2; + b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits)); + b2 = _mm_slli_si128(val, 8); // 8 bytes right + b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64-cur_bits)); + b1 = _mm_or_si128(b1, b2); + b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes)); + b2 = _mm_or_si128(b1, b2); + _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2); + + int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits; + cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3; // round up + int upper = _mm_extract_epi16(val, 7); + upper >>= consumed_bits - 128 + 16; + msp->tmp[cur_bytes] = (ui8)upper; // copy byte + + msp->bits += (ui32)bits; + msp->unstuff = next_unstuff; // next unstuff + assert(msp->unstuff == 0 || msp->unstuff == 1); + } + + //************************************************************************/ + /** @brief Initialize frwd_struct struct and reads some bytes + * + * @tparam X is the value fed in when the bitstream is exhausted. 
+     *  See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     *  @param [in] data is a pointer to the start of data
+     *  @param [in] size is the number of bytes in the bitstream
+     */
+    template<int X>
+    static inline
+    void frwd_init(frwd_struct *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128());
+      _mm_storeu_si128((__m128i *)msp->tmp + 1, _mm_setzero_si128());
+      _mm_storeu_si128((__m128i *)msp->tmp + 2, _mm_setzero_si128());
+
+      msp->bits = 0;
+      msp->unstuff = 0;
+      msp->size = size;
+
+      frwd_read<X>(msp); // read 128 bits more
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits bits from the bitstream of frwd_struct
+     *
+     *  @param [in] msp is a pointer to frwd_struct
+     *  @param [in] num_bits is the number of bits to consume
+     */
+    static inline
+    void frwd_advance(frwd_struct *msp, ui32 num_bits)
+    {
+      assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
+      msp->bits -= num_bits;
+
+      __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18));
+      num_bits &= 63;
+
+      __m128i v0, v1, c0, c1, t;
+      v0 = _mm_loadu_si128(p);
+      v1 = _mm_loadu_si128(p + 1);
+
+      // shift right by num_bits
+      c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits));
+      t = _mm_srli_si128(v0, 8);
+      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
+      c0 = _mm_or_si128(c0, t);
+      t = _mm_slli_si128(v1, 8);
+      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
+      c0 = _mm_or_si128(c0, t);
+
+      _mm_storeu_si128((__m128i*)msp->tmp, c0);
+
+      c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits));
+      t = _mm_srli_si128(v1, 8);
+      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
+      c1 = _mm_or_si128(c1, t);
+
+      _mm_storeu_si128((__m128i*)msp->tmp + 1, c1);
+    }
+
+    //************************************************************************/
+    /** @brief Fetches 128 bits from the frwd_struct bitstream
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted.
+     *   See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     */
+    template<int X>
+    static inline
+    __m128i frwd_fetch(frwd_struct *msp)
+    {
+      if (msp->bits <= 128)
+      {
+        frwd_read<X>(msp);
+        if (msp->bits <= 128) //need to test
+          frwd_read<X>(msp);
+      }
+      __m128i t = _mm_loadu_si128((__m128i*)msp->tmp);
+      return t;
+    }
+
+    //************************************************************************/
+    /** @brief decodes two consecutive quads (one octet), using 32 bit data
+     *
+     *  @param inf_u_q decoded VLC code, with interleaved u values
+     *  @param U_q U values
+     *  @param magsgn structure for forward data buffer
+     *  @param p bitplane at which we are decoding
+     *  @param vn used for handling E values (stores v_n values)
+     *  @return __m256i decoded two quads
+     */
+    static inline __m256i decode_two_quad32_avx2(__m256i inf_u_q,
+      __m256i U_q, frwd_struct* magsgn, ui32 p, __m128i& vn)
+    {
+      __m256i row = _mm256_setzero_si256();
+
+      // we keep e_k, e_1, and rho in flags
+      __m256i flags = _mm256_and_si256(inf_u_q,
+        _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110,
+                         0x8880, 0x4440, 0x2220, 0x1110));
+      __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());
+
+      if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant?
+ { + flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8)); + + // U_q holds U_q for this quad + // flags has e_k, e_1, and rho such that e_k is sitting in the + // 0x8000, e_1 in 0x800, and rho in 0x80 + + // next e_k and m_n + __m256i m_n; + __m256i w0 = _mm256_srli_epi32(flags, 15); // e_k + m_n = _mm256_sub_epi32(U_q, w0); + m_n = _mm256_andnot_si256(insig, m_n); + + // find cumulative sums + // to find at which bit in ms_vec the sample starts + __m256i inc_sum = m_n; // inclusive scan + inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4)); + inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8)); + int total_mn1 = _mm256_extract_epi16(inc_sum, 6); + int total_mn2 = _mm256_extract_epi16(inc_sum, 14); + + __m128i ms_vec0 = _mm_setzero_si128(); + __m128i ms_vec1 = _mm_setzero_si128(); + if (total_mn1) { + ms_vec0 = frwd_fetch<0xFF>(magsgn); + frwd_advance(magsgn, (ui32)total_mn1); + } + if (total_mn2) { + ms_vec1 = frwd_fetch<0xFF>(magsgn); + frwd_advance(magsgn, (ui32)total_mn2); + } + + __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1); + + __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4); // exclusive scan + + // find the starting byte and starting bit + __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3); + __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7)); + byte_idx = _mm256_shuffle_epi8(byte_idx, + _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000)); + byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100)); + __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx); + byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101)); + __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx); + + // shift samples values to correct location + bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16)); + + __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1); + __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1); + + __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx); + bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101)); + d0 = _mm256_mullo_epi16(d0, bit_shift); + d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB + d1 = _mm256_mullo_epi16(d1, bit_shift); + d1 = _mm256_and_si256(d1, _mm256_set1_epi32((si32)0xFF00FF00)); // 8 in MSB + d0 = _mm256_or_si256(d0, d1); + + // find location of e_k and mask + __m256i shift; + __m256i ones = _mm256_set1_epi32(1); + __m256i twos = _mm256_set1_epi32(2); + __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones); + U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F)); + U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0); + w0 = _mm256_sub_epi32(twos, w0); + shift = _mm256_sllv_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31 + ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones)); + + // next e_1 + w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800)); + w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256()); + w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position + ms_vec = _mm256_or_si256(ms_vec, w0); // e_1 + w0 = _mm256_slli_epi32(ms_vec, 31); // sign + ms_vec = _mm256_or_si256(ms_vec, ones); // bin center + __m256i tvn = ms_vec; + ms_vec = _mm256_add_epi32(ms_vec, twos);// + 2 + ms_vec = _mm256_slli_epi32(ms_vec, (si32)p - 1); + ms_vec = _mm256_or_si256(ms_vec, w0); // sign + row = _mm256_andnot_si256(insig, ms_vec); // significant 
// only
+
+        ms_vec = _mm256_andnot_si256(insig, tvn); // significant only
+
+        tvn = _mm256_shuffle_epi8(ms_vec,
+          _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1,
+                           -1, -1, 0x0F0E0D0C, 0x07060504));
+
+        vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
+        vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
+      }
+      return row;
+    }
+
+
+    //************************************************************************/
+    /** @brief decodes four consecutive quads (two octets), using 16 bit data
+     *
+     *  @param inf_u_q decoded VLC code, with interleaved u values
+     *  @param U_q U values
+     *  @param magsgn structure for forward data buffer
+     *  @param p bitplane at which we are decoding
+     *  @param vn used for handling E values (stores v_n values)
+     *  @return __m256i decoded four quads
+     */
+
+    static inline __m256i decode_four_quad16(const __m128i inf_u_q,
+      __m128i U_q, frwd_struct* magsgn, ui32 p, __m128i& vn)
+    {
+      __m256i w0;    // workers
+      __m256i insig; // lanes hold FF's if samples are insignificant
+      __m256i flags; // lanes hold e_k, e_1, and rho
+
+      __m256i row = _mm256_setzero_si256();
+      __m128i ddd = _mm_shuffle_epi8(inf_u_q,
+        _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x0908,
+                      0x0504, 0x0504, 0x0100, 0x0100));
+      w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
+        _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
+      // we keep e_k, e_1, and rho in flags
+      flags = _mm256_and_si256(w0,
+        _mm256_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
+                         (si16)0x8880, 0x4440, 0x2220, 0x1110,
+                         (si16)0x8880, 0x4440, 0x2220, 0x1110,
+                         (si16)0x8880, 0x4440, 0x2220, 0x1110));
+      insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
+      if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant?
+      {
+        ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
+        __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
+          _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
+        flags = _mm256_mullo_epi16(flags,
+          _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));
+
+        // U_q holds U_q for this quad
+        // flags has e_k, e_1, and rho such that e_k is sitting in the
+        // 0x8000, e_1 in 0x800, and rho in 0x80
+
+        // next e_k and m_n
+        __m256i m_n;
+        w0 = _mm256_srli_epi16(flags, 15); // e_k
+        m_n = _mm256_sub_epi16(U_q_avx, w0);
+        m_n = _mm256_andnot_si256(insig, m_n);
+
+        // find cumulative sums
+        // to find at which bit in ms_vec the sample starts
+        __m256i inc_sum = m_n; // inclusive scan
+        inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
+        inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
+        inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
+        int total_mn1 = _mm256_extract_epi16(inc_sum, 7);
+        int total_mn2 = _mm256_extract_epi16(inc_sum, 15);
+        __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2); // exclusive scan
+
+        __m128i ms_vec0 = _mm_setzero_si128();
+        __m128i ms_vec1 = _mm_setzero_si128();
+        if (total_mn1) {
+          ms_vec0 = frwd_fetch<0xFF>(magsgn);
+          frwd_advance(magsgn, (ui32)total_mn1);
+        }
+        if (total_mn2) {
+          ms_vec1 = frwd_fetch<0xFF>(magsgn);
+          frwd_advance(magsgn, (ui32)total_mn2);
+        }
+
+        __m256i ms_vec =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
+
+        // find the starting byte and starting bit
+        __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
+        __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
+        byte_idx = _mm256_shuffle_epi8(byte_idx,
+          _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
+                           0x0606, 0x0404,
0x0202, 0x0000)); + byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100)); + __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx); + byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101)); + __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx); + + // shift samples values to correct location + __m256i bit_shift = _mm256_shuffle_epi8( + _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, + 1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1, + 1, 3, 7, 15, 31, 63, 127, -1), bit_idx); + bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101)); + d0 = _mm256_mullo_epi16(d0, bit_shift); + d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB + d1 = _mm256_mullo_epi16(d1, bit_shift); + d1 = _mm256_and_si256(d1, _mm256_set1_epi16((si16)0xFF00)); // 8 in MSB + d0 = _mm256_or_si256(d0, d1); + + // find location of e_k and mask + __m256i shift, t0, t1, Uq0, Uq1; + __m256i ones = _mm256_set1_epi16(1); + __m256i twos = _mm256_set1_epi16(2); + __m256i U_q_m1 = _mm256_sub_epi32(U_q_avx, ones); + Uq0 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F)); + Uq1 = _mm256_bsrli_epi128(U_q_m1, 14); + w0 = _mm256_sub_epi16(twos, w0); + t0 = _mm256_and_si256(w0, _mm256_set_epi64x(0, -1, 0, -1)); + t1 = _mm256_and_si256(w0, _mm256_set_epi64x(-1, 0, -1, 0)); + {//no _mm256_sllv_epi16 in avx2 + __m128i t_0_sse = _mm256_castsi256_si128(t0); + t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq0)); + __m128i t_1_sse = _mm256_extracti128_si256(t0 , 0x1); + t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq0, 0x1)); + t0 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1); + + t_0_sse = _mm256_castsi256_si128(t1); + t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq1)); + t_1_sse = _mm256_extracti128_si256(t1, 0x1); + t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq1, 0x1)); + t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1); + } + shift = _mm256_or_si256(t0, t1); + ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones)); + + // next e_1 + w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800)); + w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256()); + w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position + ms_vec = _mm256_or_si256(ms_vec, w0); // e_1 + w0 = _mm256_slli_epi16(ms_vec, 15); // sign + ms_vec = _mm256_or_si256(ms_vec, ones); // bin center + __m256i tvn = ms_vec; + ms_vec = _mm256_add_epi16(ms_vec, twos);// + 2 + ms_vec = _mm256_slli_epi16(ms_vec, (si32)p - 1); + ms_vec = _mm256_or_si256(ms_vec, w0); // sign + row = _mm256_andnot_si256(insig, ms_vec); // significant only + + ms_vec = _mm256_andnot_si256(insig, tvn); // significant only + + __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec, + _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1, + -1, -1, -1, -1, -1, -1, 0x0706, 0x0302)); + __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec, + _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1, + -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1)); + ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2); + + vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec)); + vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1)); + } + return row; + } + + // https://stackoverflow.com/a/58827596 + inline __m256i avx2_lzcnt_epi32(__m256i v) { + // prevent value from being rounded up to the next power of two + v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep 8 MSB + + v = _mm256_castps_si256(_mm256_cvtepi32_ps(v)); // convert an integer 
to float
+      v = _mm256_srli_epi32(v, 23); // shift down the exponent
+      v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias
+      v = _mm256_min_epi16(v, _mm256_set1_epi32(32)); // clamp at 32
+
+      return v;
+    }
+
+    //************************************************************************/
+    /** @brief Decodes one codeblock, processing the cleanup, significance
+      *        propagation, and magnitude refinement passes
+      *
+      *  @param [in]  coded_data is a pointer to bitstream
+      *  @param [in]  decoded_data is a pointer to decoded codeblock data buffer
+      *  @param [in]  missing_msbs is the number of missing MSBs
+      *  @param [in]  num_passes is the number of passes: 1 if CUP only,
+      *               2 for CUP+SPP, and 3 for CUP+SPP+MRP
+      *  @param [in]  lengths1 is the length of cleanup pass
+      *  @param [in]  lengths2 is the length of refinement passes (either SPP
+      *               only or SPP+MRP)
+      *  @param [in]  width is the decoded codeblock width
+      *  @param [in]  height is the decoded codeblock height
+      *  @param [in]  stride is the decoded codeblock buffer stride
+      *  @param [in]  stripe_causal is true for stripe causal mode
+      */
+    bool ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data,
+                                    ui32 missing_msbs, ui32 num_passes,
+                                    ui32 lengths1, ui32 lengths2,
+                                    ui32 width, ui32 height, ui32 stride,
+                                    bool stripe_causal)
+    {
+      static bool insufficient_precision = false;
+      static bool modify_code = false;
+      static bool truncate_spp_mrp = false;
+
+      if (num_passes > 1 && lengths2 == 0)
+      {
+        OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
+                              "one coding pass, but zero length for "
+                              "2nd and potential 3rd pass.");
+        num_passes = 1;
+      }
+
+      if (num_passes > 3)
+      {
+        OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
+                              "this codeblock has %d passes.",
+                              num_passes);
+        return false;
+      }
+
+      if (missing_msbs > 30) // p < 0
+      {
+        if (insufficient_precision == false)
+        {
+          insufficient_precision = true;
+          OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
+                                "codeblock. This message will not be "
+                                "displayed again.");
+        }
+        return false;
+      }
+      else if (missing_msbs == 30) // p == 0
+      { // not enough precision to decode and set the bin center to 1
+        if (modify_code == false) {
+          modify_code = true;
+          OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
+                                "pass. The code can be modified to support "
+                                "this case. This message will not be "
+                                "displayed again.");
+        }
+        return false; // 32 bits are not enough to decode this
+      }
+      else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
+      {
+        if (num_passes > 1) {
+          num_passes = 1;
+          if (truncate_spp_mrp == false) {
+            truncate_spp_mrp = true;
+            OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
+                                  "nor MagRef passes; both will be skipped. "
+                                  "This message will not be displayed "
+                                  "again.");
+          }
+        }
+      }
+      ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
+      // There is a way to handle the case of p == 0, but a different path
+      // is required
+
+      if (lengths1 < 2)
+      {
+        OJPH_WARN(0x00010006, "Wrong codeblock length.");
+        return false;
+      }
+
+      // read scup and fix the bytes there
+      int lcup, scup;
+      lcup = (int)lengths1; // length of CUP
+      //scup is the length of MEL + VLC
+      scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
+      if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
+        return false;
+
+      // The temporary storage scratch holds two types of data in an
+      // interleaved fashion. The interleaving allows us to use one
+      // memory pointer.
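As an aside, the float-exponent trick in avx2_lzcnt_epi32 above is easier to check in scalar form. The following is a minimal sketch of the same computation for a single 32-bit value; the function name and the memcpy-based bit inspection are illustrative, not part of the patch:

```cpp
#include <cstdint>
#include <cstring>

// Count leading zeros by reading the exponent of a float conversion.
// 158 = 127 (exponent bias) + 31 (bit index of the MSB).
static inline uint32_t lzcnt_via_float(uint32_t v)
{
  v &= ~(v >> 8);             // clear bits that could make the float
                              // conversion round up to the next power of 2
  float f = (float)v;
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint32_t n = 158u - (bits >> 23);   // 158 minus the biased exponent
  return n > 32u ? 32u : n;           // clamp; v == 0 yields 32
}
```

In the vector version, the saturating _mm256_subs_epu16 takes care of the lower end of this clamp: if a lane has its top bit set, the sign bit leaks into the shifted exponent field, the subtraction saturates at zero, and the count correctly comes out as 0.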
+      // We have one entry for a decoded VLC code, and one entry for UVLC.
+      // Entries are 16 bits each, corresponding to one quad,
+      // but since we want to use XMM registers of the SSE family
+      // of SIMD, we allocate 16 bytes or more per quad row; that is,
+      // the width is no smaller than 16 bytes (or 8 entries), and the
+      // height is 512 quads.
+      // Each VLC entry contains, in the following order, starting
+      // from MSB
+      // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
+      // Each entry in UVLC contains u_q
+      // One extra row to handle the case of SPP propagating downwards
+      // when codeblock width is 4
+      ui16 scratch[8 * 513] = {0}; // 8+ kB
+
+      // We need an extra two entries (one inf and one u_q) beyond
+      // the last column.
+      // If the block width is 4 (2 quads), then we use sstr of 8
+      // (enough for 4 quads). If width is 8 (4 quads), we use sstr
+      // of 16 (enough for 8 quads). For a width of 16 (8
+      // quads), we use 24 (enough for 12 quads).
+      ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
+
+      assert((stride & 0x3) == 0);
+
+      ui32 mmsbp2 = missing_msbs + 2;
+
+      // The cleanup pass is decoded in two steps; in step one,
+      // the VLC and MEL segments are decoded, generating a record that
+      // has 2 bytes per quad. The 2 bytes contain u, rho, e^1 & e^k.
+      // This information should be sufficient for the next step.
+      // In step 2, we decode the MagSgn segment.
+
+      // step 1 decoding VLC and MEL segments
+      {
+        // init structures
+        dec_mel_st mel;
+        mel_init(&mel, coded_data, lcup, scup);
+        rev_struct vlc;
+        rev_init(&vlc, coded_data, lcup, scup);
+
+        int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
+                                     // data represented as runs of 0 events
+                                     // See mel_decode description
+
+        ui32 vlc_val;
+        ui32 c_q = 0;
+        ui16 *sp = scratch;
+        //initial quad row
+        for (ui32 x = 0; x < width; sp += 4)
+        {
+          // decode VLC
+          /////////////
+
+          // first quad
+          vlc_val = rev_fetch(&vlc);
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0) //zero context
+          {
+            run -= 2; //subtract 2, since events number is multiplied by 2
+
+            // Is the run terminated in 1? if so, use decoded VLC code,
+            // otherwise, discard decoded data, since we will decode again
+            // using a different context
+            t0 = (run == -1) ? t0 : 0;
+
+            // is run -1 or -2? this means a run has been consumed
+            if (run < 0)
+              run = mel_get_run(&mel); // get another run
+          }
+          //run -= (c_q == 0) ? 2 : 0;
+          //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+          sp[0] = t0;
+          x += 2;
+
+          // prepare context for the next quad; eqn. 1 in ITU T.814
+          c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
+
+          //remove data from vlc stream (0 bits are removed if vlc is not used)
+          vlc_val = rev_advance(&vlc, t0 & 0x7);
+
+          //second quad
+          ui16 t1 = 0;
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0 && x < width) //zero context
+          {
+            run -= 2; //subtract 2, since events number is multiplied by 2
+
+            // if event is 0, discard decoded t1
+            t1 = (run == -1) ? t1 : 0;
+
+            if (run < 0) // have we consumed all events in a run
+              run = mel_get_run(&mel); // if yes, then get another run
+          }
+          t1 = x < width ? t1 : 0;
+          //run -= (c_q == 0 && x < width) ? 2 : 0;
+          //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+          sp[2] = t1;
+          x += 2;
+
+          //prepare context for the next quad, eqn. 1 in ITU T.814
+          c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
+
+          //remove data from vlc stream, if qinf is not used, cwdlen is 0
+          vlc_val = rev_advance(&vlc, t1 & 0x7);
+
+          // decode u
+          /////////////
+          // uvlc_mode is made up of u_offset bits from the quad pair
+          ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+          if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
+          {                     // the MEL run of events
+            run -= 2; //subtract 2, since events number is multiplied by 2
+
+            uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
+                                                 // 0x40 if event is 1
+
+            if (run < 0)//if run is consumed (run is -1 or -2), get another run
+              run = mel_get_run(&mel);
+          }
+          //run -= (uvlc_mode == 0xc0) ? 2 : 0;
+          //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+
+          //decode uvlc_mode to get u for both quads
+          ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
+          //remove total prefix length
+          vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
+          uvlc_entry >>= 3;
+          //extract suffixes for quad 0 and 1
+          ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
+          ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
+          vlc_val = rev_advance(&vlc, len);
+          ojph_unused(vlc_val); //static code analysis: unused value
+          uvlc_entry >>= 4;
+          // quad 0 length
+          len = uvlc_entry & 0x7; // quad 0 suffix length
+          uvlc_entry >>= 3;
+          ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kappa == 1
+          sp[1] = u_q;
+          u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
+          sp[3] = u_q;
+        }
+        sp[0] = sp[1] = 0;
+
+        //non initial quad rows
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          c_q = 0; // context
+          ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
+
+          for (ui32 x = 0; x < width; sp += 4)
+          {
+            // decode VLC
+            /////////////
+
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
+
+            // first quad
+            vlc_val = rev_fetch(&vlc);
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) //zero context
+            {
+              run -= 2; //subtract 2, since events number is multiplied by 2
+
+              // Is the run terminated in 1? if so, use decoded VLC code,
+              // otherwise, discard decoded data, since we will decode again
+              // using a different context
+              t0 = (run == -1) ? t0 : 0;
+
+              // is run -1 or -2? this means a run has been consumed
+              if (run < 0)
+                run = mel_get_run(&mel); // get another run
+            }
+            //run -= (c_q == 0) ? 2 : 0;
+            //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel); // get another run
+            sp[0] = t0;
+            x += 2;
+
+            // prepare context for the next quad; eqn.
2 in ITU T.814 + // sigma_q (w, sw) + c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1); + // sigma_q (nw) + c_q |= sp[0 - (si32)sstr] & 0x80; + // sigma_q (n, ne, nf) + c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2); + c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4); + + //remove data from vlc stream (0 bits are removed if vlc is unused) + vlc_val = rev_advance(&vlc, t0 & 0x7); + + //second quad + ui16 t1 = 0; + + //decode VLC using the context c_q and the head of VLC bitstream + t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)]; + + // if context is zero, use one MEL event + if (c_q == 0 && x < width) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // if event is 0, discard decoded t1 + t1 = (run == -1) ? t1 : 0; + + if (run < 0) // have we consumed all events in a run + run = mel_get_run(&mel); // if yes, then get another run + } + t1 = x < width ? t1 : 0; + //run -= (c_q == 0 && x < width) ? 2 : 0; + //t1 = (c_q != 0 || run == -1) ? t1 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[2] = t1; + x += 2; + + // partial c_q, will be completed when we process the next quad + // sigma_q (w, sw) + c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1); + // sigma_q (nw) + c_q |= sp[2 - (si32)sstr] & 0x80; + + //remove data from vlc stream, if qinf is not used, cwdlen is 0 + vlc_val = rev_advance(&vlc, t1 & 0x7); + + // decode u + ///////////// + // uvlc_mode is made up of u_offset bits from the quad pair + ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4); + ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)]; + //remove total prefix length + vlc_val = rev_advance(&vlc, uvlc_entry & 0x7); + uvlc_entry >>= 3; + //extract suffixes for quad 0 and 1 + ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads + ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads + vlc_val = rev_advance(&vlc, len); + ojph_unused(vlc_val); //static code analysis: unused value + uvlc_entry >>= 4; + // quad 0 length + len = uvlc_entry & 0x7; // quad 0 suffix length + uvlc_entry >>= 3; + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); + sp[1] = u_q; + u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q + sp[3] = u_q; + } + sp[0] = sp[1] = 0; + } + } + + // step2 we decode magsgn + // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit) + // The 32 bit path decode 16 bits data, for which one would think + // 16 bits are enough, because we want to put in the center of the + // bin. + // If you have mmsbp2 equals 16 bit, and reversible coding, and + // no bitplanes are missing, then we can decoding using the 16 bit + // path, but we are not doing this here. + if (mmsbp2 >= 16) + { + // We allocate a scratch row for storing v_n values. + // We have 512 quads horizontally. + // We may go beyond the last entry by up to 4 entries. + // Here we allocate additional 8 entries. + // There are two rows in this structure, the bottom + // row is used to store processed entries. 
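Before the 32-bit path below, it may help to restate what step 1 left behind: each u_q was assembled from a uvlc_tbl0/uvlc_tbl1 entry in three stages (total prefix length, shared suffix bits, per-quad fields). This scalar sketch mirrors the shifts used in the initial-row loop (kappa == 1); the struct, the function name, and the parameter packaging are illustrative only:

```cpp
#include <cstdint>

struct u_pair { uint32_t u_q0, u_q1; };

// entry: a table entry whose 3-bit total-prefix-length field has already
// been consumed from the VLC stream; suffix_bits: the next VLC stream bits.
// Field layout implied by the shifts above (LSB first):
//   [2:0] total prefix length, [6:3] total suffix length for the pair,
//   [9:7] quad-0 suffix length, [12:10] quad-0 u prefix, [15:13] quad-1.
static u_pair unpack_uvlc(uint32_t entry, uint32_t suffix_bits)
{
  entry >>= 3;                         // total prefix length, already used
  uint32_t len = entry & 0xF;          // suffix bits shared by both quads
  uint32_t tmp = suffix_bits & ((1u << len) - 1);
  entry >>= 4;
  len = entry & 0x7;                   // quad 0 takes the low bits of tmp
  entry >>= 3;
  u_pair r;
  r.u_q0 = 1 + (entry & 7) + (tmp & ~(0xFFu << len)); // kappa == 1
  r.u_q1 = 1 + (entry >> 3) + (tmp >> len);           // kappa == 1
  return r;
}
```

The non-initial rows use the same unpacking, only without the fixed kappa of 1 added here, as in the loop above.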
+ const int v_n_size = 512 + 16; + ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB + + frwd_struct magsgn; + frwd_init<0xFF>(&magsgn, coded_data, lcup - scup); + + const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2); + + { + ui16 *sp = scratch; + ui32 *vp = v_n_scratch; + ui32 *dp = decoded_data; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) + { + __m128i vn = _mm_set1_epi32(2); + + __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp)); + inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + + __m256i U_q = _mm256_srli_epi32(inf_u_q, 16); + __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2); + if (!_mm256_testz_si256(w, w)) { + return false; + } + + __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn); + row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row)); + _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1)); + + __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp); + w0 = _mm_or_si128(w0, vn); + _mm_storeu_si128((__m128i*)vp, w0); + } + } + + for (ui32 y = 2; y < height; y += 2) + { + { + // perform 31 - count_leading_zeros(*vp) here + ui32 *vp = v_n_scratch; + ui16* sp = scratch + (y >> 1) * sstr; + + const __m256i avx_31 = _mm256_set1_epi32(31); + const __m256i avx_f0 = _mm256_set1_epi32(0xF0); + const __m256i avx_1 = _mm256_set1_epi32(1); + const __m256i avx_0 = _mm256_setzero_si256(); + + for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) { + __m256i v = _mm256_loadu_si256((__m256i*)vp); + __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1)); + v = _mm256_or_si256(v, v_p1); + v = avx2_lzcnt_epi32(v); + v = _mm256_sub_epi32(avx_31, v); + + __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp); + __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0); + __m256i w0 = _mm256_sub_epi32(gamma, avx_1); + gamma = _mm256_and_si256(gamma, w0); + gamma = _mm256_cmpeq_epi32(gamma, avx_0); + + v = _mm256_andnot_si256(gamma, v); + v = _mm256_max_epi32(v, avx_1); + + inf_u_q = _mm256_srli_epi32(inf_u_q, 16); + v = _mm256_add_epi32(inf_u_q, v); + + w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2); + if (!_mm256_testz_si256(w0, w0)) { + return false; + } + + _mm256_storeu_si256((__m256i*)(vp + v_n_size), v); + } + } + + ui32 *vp = v_n_scratch; + ui16 *sp = scratch + (y >> 1) * sstr; + ui32 *dp = decoded_data + y * stride; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) { + //process two quads + __m128i vn = _mm_set1_epi32(2); + + __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp)); + inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + + __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size))); + U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + + __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn); + row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row)); + _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1)); + + __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp); + w0 = _mm_or_si128(w0, vn); + _mm_storeu_si128((__m128i*)vp, w0); + } + } + } + else { + + // reduce bitplane by 16 because we now have 16 bits instead of 32 + p -= 16; + + // We allocate a scratch 
row for storing v_n values. + // We have 512 quads horizontally. + // We may go beyond the last entry by up to 8 entries. + // Therefore we allocate additional 8 entries. + // There are two rows in this structure, the bottom + // row is used to store processed entries. + const int v_n_size = 512 + 16; + ui16 v_n_scratch[v_n_size] = {0}; // 1+ kB + ui32 v_n_scratch_32[v_n_size] = {0}; // 2+ kB + + frwd_struct magsgn; + frwd_init<0xFF>(&magsgn, coded_data, lcup - scup); + + { + ui16 *sp = scratch; + ui16 *vp = v_n_scratch; + ui32 *dp = decoded_data; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) { + ////process four quads + __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp); + __m128i U_q = _mm_srli_epi32(inf_u_q, 16); + __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2)); + if (!_mm_testz_si128(w, w)) { + return false; + } + + __m128i vn = _mm_set1_epi16(2); + __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn); + + w = _mm_cvtsi32_si128(*(unsigned short const*)(vp)); + _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn)); + + __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1)); + __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1)); + + _mm256_storeu_si256((__m256i*)dp, w0); + _mm256_storeu_si256((__m256i*)(dp + stride), w1); + } + } + + for (ui32 y = 2; y < height; y += 2) { + { + // perform 15 - count_leading_zeros(*vp) here + ui16 *vp = v_n_scratch; + ui32 *vp_32 = v_n_scratch_32; + + ui16* sp = scratch + (y >> 1) * sstr; + const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2); + const __m256i avx_31 = _mm256_set1_epi32(31); + const __m256i avx_f0 = _mm256_set1_epi32(0xF0); + const __m256i avx_1 = _mm256_set1_epi32(1); + const __m256i avx_0 = _mm256_setzero_si256(); + + for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) { + __m128i v = _mm_loadu_si128((__m128i*)vp); + __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1)); + v = _mm_or_si128(v, v_p1); + + __m256i v_avx = _mm256_cvtepu16_epi32(v); + v_avx = avx2_lzcnt_epi32(v_avx); + v_avx = _mm256_sub_epi32(avx_31, v_avx); + + __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp); + __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0); + __m256i w0 = _mm256_sub_epi32(gamma, avx_1); + gamma = _mm256_and_si256(gamma, w0); + gamma = _mm256_cmpeq_epi32(gamma, avx_0); + + v_avx = _mm256_andnot_si256(gamma, v_avx); + v_avx = _mm256_max_epi32(v_avx, avx_1); + + inf_u_q = _mm256_srli_epi32(inf_u_q, 16); + v_avx = _mm256_add_epi32(inf_u_q, v_avx); + + w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2); + if (!_mm256_testz_si256(w0, w0)) { + return false; + } + + _mm256_storeu_si256((__m256i*)vp_32, v_avx); + } + } + + ui16 *vp = v_n_scratch; + ui32* vp_32 = v_n_scratch_32; + ui16 *sp = scratch + (y >> 1) * sstr; + ui32 *dp = decoded_data + y * stride; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) { + ////process four quads + __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp); + __m128i U_q = _mm_loadu_si128((__m128i*)vp_32); + + __m128i vn = _mm_set1_epi16(2); + __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn); + + __m128i w = _mm_cvtsi32_si128(*(unsigned short const*)(vp)); + _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn)); + + __m256i w0 = _mm256_shuffle_epi8(row, 
_mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
+          __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
+
+          _mm256_storeu_si256((__m256i*)dp, w0);
+          _mm256_storeu_si256((__m256i*)(dp + stride), w1);
+        }
+      }
+
+      // increase bitplane back by 16 because we need to process 32 bits
+      p += 16;
+    }
+
+    if (num_passes > 1)
+    {
+      // We use scratch again; we can divide it into multiple regions
+      // sigma holds all the significant samples, and it cannot
+      // be modified after it is set. It will be used during the
+      // Magnitude Refinement Pass
+      ui16* const sigma = scratch;
+
+      ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
+                                       // ui16 contains 4 columns
+      mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
+
+      // We re-arrange quad significance, where each 4 consecutive
+      // bits represent one quad, into column significance, where
+      // each 4 consecutive bits represent one column of 4 rows
+      {
+        ui32 y;
+
+        const __m128i mask_3 = _mm_set1_epi32(0x30);
+        const __m128i mask_C = _mm_set1_epi32(0xC0);
+        const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
+        for (y = 0; y < height; y += 4)
+        {
+          ui16* sp = scratch + (y >> 1) * sstr;
+          ui16* dp = sigma + (y >> 2) * mstr;
+          for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
+          {
+            __m128i s0, s1, u3, uC, t0, t1;
+
+            s0 = _mm_loadu_si128((__m128i*)(sp));
+            u3 = _mm_and_si128(s0, mask_3);
+            u3 = _mm_srli_epi32(u3, 4);
+            uC = _mm_and_si128(s0, mask_C);
+            uC = _mm_srli_epi32(uC, 2);
+            t0 = _mm_or_si128(u3, uC);
+
+            s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
+            u3 = _mm_and_si128(s1, mask_3);
+            u3 = _mm_srli_epi32(u3, 2);
+            uC = _mm_and_si128(s1, mask_C);
+            t1 = _mm_or_si128(u3, uC);
+
+            __m128i r = _mm_or_si128(t0, t1);
+            r = _mm_shuffle_epi8(r, shuffle_mask);
+
+            // _mm_storeu_si32 is not defined, so we use this workaround
+            _mm_store_ss((float*)dp, _mm_castsi128_ps(r));
+          }
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+        {
+          // reset one row after the codeblock
+          ui16* dp = sigma + (y >> 2) * mstr;
+          __m128i zero = _mm_setzero_si128();
+          for (ui32 x = 0; x < width; x += 32, dp += 8)
+            _mm_store_si128((__m128i*)dp, zero);
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+      }
+
+      // We perform Significance Propagation Pass here
+      {
+        // This stores significance information of the previous
+        // 4 rows. Significance information in this array includes
+        // all significant samples in bitplane p - 1; that is,
+        // significant samples for bitplane p (discovered during the
+        // cleanup pass and stored in sigma) and samples that have recently
+        // become significant (during the SPP) in bitplane p-1.
+        // We store enough for the widest row, containing 1024 columns,
+        // which is equivalent to 256 of ui16, since each stores 4 columns.
+        // We add an extra 8 entries, just in case we need more
+        ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
+
+        frwd_struct sigprop;
+        frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
+
+        for (ui32 y = 0; y < height; y += 4)
+        {
+          ui32 pattern = 0xFFFFu; // a pattern of needed samples
+          if (height - y < 4) {
+            pattern = 0x7777u;
+            if (height - y < 3) {
+              pattern = 0x3333u;
+              if (height - y < 2)
+                pattern = 0x1111u;
+            }
+          }
+
+          // prev holds sign. info. for the previous quad, together
+          // with the rows on top of it and below it.
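The loop below leans on one bit trick throughout: significance for a 4-row stripe is nibble-packed, one 4-row column per nibble, so a sample's neighborhood can be grown with shifts and masks instead of per-sample tests. A scalar sketch of the candidate ("mbr") computation, with illustrative names and with the partial-stripe pattern mask left out:

```cpp
#include <cstdint>

// cs: current significance, 8 columns x 4 rows, one column per nibble,
// nibble LSB = top row; above/below: border rows, already shifted into
// this stripe's bit positions; prev: significance of the previous group,
// masked to 0xF000 as in the loop below.
static uint32_t spp_candidates(uint32_t cs, uint32_t above, uint32_t below,
                               uint32_t prev)
{
  uint32_t mbr = cs;
  mbr |= (cs & 0x77777777u) << 1;  // neighbor above is significant
  mbr |= (cs & 0xEEEEEEEEu) >> 1;  // neighbor below is significant
  mbr |= above | below;            // rows bordering the stripe
  uint32_t t = mbr;
  mbr |= t << 4;                   // left neighbor column
  mbr |= t >> 4;                   // right neighbor column
  mbr |= prev >> 12;               // last column of the previous group
  return mbr & ~cs;                // drop already-significant samples
}
```

Positions that survive this mask are the only ones for which SPP bits are read from the bitstream, which is what lets the pass consume frwd_fetch data sparsely.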
+ ui32 prev = 0; + ui16 *prev_sig = prev_row_sig; + ui16 *cur_sig = sigma + (y >> 2) * mstr; + ui32 *dpp = decoded_data + y * stride; + for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig) + { + // only rows and columns inside the stripe are included + si32 s = (si32)x + 4 - (si32)width; + s = ojph_max(s, 0); + pattern = pattern >> (s * 4); + + // We first find locations that need to be tested (potential + // SPP members); these location will end up in mbr + // In each iteration, we produce 16 bits because cwd can have + // up to 16 bits of significance information, followed by the + // corresponding 16 bits of sign information; therefore, it is + // sufficient to fetch 32 bit data per loop. + + // Althougth we are interested in 16 bits only, we load 32 bits. + // For the 16 bits we are producing, we need the next 4 bits -- + // We need data for at least 5 columns out of 8. + // Therefore loading 32 bits is easier than loading 16 bits + // twice. + ui32 ps = *(ui32*)prev_sig; + ui32 ns = *(ui32*)(cur_sig + mstr); + ui32 u = (ps & 0x88888888) >> 3; // the row on top + if (!stripe_causal) + u |= (ns & 0x11111111) << 3; // the row below + + ui32 cs = *(ui32*)cur_sig; + // vertical integration + ui32 mbr = cs; // this sig. info. + mbr |= (cs & 0x77777777) << 1; //above neighbors + mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors + mbr |= u; + // horizontal integration + ui32 t = mbr; + mbr |= t << 4; // neighbors on the left + mbr |= t >> 4; // neighbors on the right + mbr |= prev >> 12; // significance of previous group + + // remove outside samples, and already significant samples + mbr &= pattern; + mbr &= ~cs; + + // find samples that become significant during the SPP + ui32 new_sig = mbr; + if (new_sig) + { + __m128i cwd_vec = frwd_fetch<0>(&sigprop); + ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0); + + ui32 cnt = 0; + ui32 col_mask = 0xFu; + ui32 inv_sig = ~cs & pattern; + for (int i = 0; i < 16; i += 4, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan one column + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x33u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x76u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xECu << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xC8u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + } + + if (new_sig) + { + cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt); + + // Spread new_sig, such that each bit is in one byte with a + // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1 + __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig); + new_sig_vec = _mm_shuffle_epi8(new_sig_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + new_sig_vec = _mm_and_si128(new_sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + new_sig_vec = _mm_cmpeq_epi8(new_sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + + // find cumulative sums + // to find which bit in cwd we should extract + __m128i inc_sum = new_sig_vec; // inclusive scan + inc_sum = _mm_abs_epi8(inc_sum); // cvrt to 0 or 1 + inc_sum = _mm_add_epi8(inc_sum, 
_mm_bslli_si128(inc_sum, 1)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8)); + cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8; + // exclusive scan + __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); + + // Spread cwd, such that each bit is in one byte + // with a value of 0 or 1. + cwd_vec = _mm_set1_epi16((si16)cwd); + cwd_vec = _mm_shuffle_epi8(cwd_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + cwd_vec = _mm_and_si128(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_cmpeq_epi8(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_abs_epi8(cwd_vec); + + // Obtain bit from cwd_vec correspondig to ex_sum + // Basically, collect needed bits from cwd_vec + __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum); + + // load data and set spp coefficients + __m128i m = + _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0); + __m128i val = _mm_set1_epi32(3 << (p - 2)); + ui32 *dp = dpp; + for (int c = 0; c < 4; ++ c) { + __m128i s0, s0_ns, s0_val; + // load coefficients + s0 = _mm_load_si128((__m128i*)dp); + + // epi32 is -1 only for coefficient that + // are changed during the SPP + s0_ns = _mm_shuffle_epi8(new_sig_vec, m); + s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF)); + + // obtain sign for coefficients in SPP + s0_val = _mm_shuffle_epi8(v, m); + s0_val = _mm_slli_epi32(s0_val, 31); + s0_val = _mm_or_si128(s0_val, val); + s0_val = _mm_and_si128(s0_val, s0_ns); + + // update vector + s0 = _mm_or_si128(s0, s0_val); + // store coefficients + _mm_store_si128((__m128i*)dp, s0); + // prepare for next row + dp += stride; + m = _mm_add_epi32(m, _mm_set1_epi32(1)); + } + } + frwd_advance(&sigprop, cnt); + } + + new_sig |= cs; + *prev_sig = (ui16)(new_sig); + + // vertical integration for the new sig. info. + t = new_sig; + new_sig |= (t & 0x7777) << 1; //above neighbors + new_sig |= (t & 0xEEEE) >> 1; //below neighbors + // add sig. info. 
from the row on top and below + prev = new_sig | u; + // we need only the bits in 0xF000 + prev &= 0xF000; + } + } + } + + // We perform Magnitude Refinement Pass here + if (num_passes > 2) + { + rev_struct magref; + rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2); + + for (ui32 y = 0; y < height; y += 4) + { + ui16 *cur_sig = sigma + (y >> 2) * mstr; + ui32 *dpp = decoded_data + y * stride; + for (ui32 i = 0; i < width; i += 4, dpp += 4) + { + //Process one entry from sigma array at a time + // Each nibble (4 bits) in the sigma array represents 4 rows, + ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data + ui16 sig = *cur_sig++; // 16 bit that will be processed now + int total_bits = 0; + if (sig) // if any of the 32 bits are set + { + // We work on 4 rows, with 4 samples each, since + // data is 32 bit (4 bytes) + + // spread the 16 bits in sig to 0 or 1 bytes in sig_vec + __m128i sig_vec = _mm_set1_epi16((si16)sig); + sig_vec = _mm_shuffle_epi8(sig_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + sig_vec = _mm_and_si128(sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + sig_vec = _mm_cmpeq_epi8(sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + sig_vec = _mm_abs_epi8(sig_vec); + + // find cumulative sums + // to find which bit in cwd we should extract + __m128i inc_sum = sig_vec; // inclusive scan + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8)); + total_bits = _mm_extract_epi16(inc_sum, 7) >> 8; + __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan + + // Spread the 16 bits in cwd to inverted 0 or 1 bytes in + // cwd_vec. 
Then, convert these to a form suitable + // for coefficient modifications; in particular, a value + // of 0 is presented as binary 11, and a value of 1 is + // represented as binary 01 + __m128i cwd_vec = _mm_set1_epi16((si16)cwd); + cwd_vec = _mm_shuffle_epi8(cwd_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + cwd_vec = _mm_and_si128(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_cmpeq_epi8(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1)); + cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec); + cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1)); + + // load data and insert the mrp bit + __m128i m = + _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0); + ui32 *dp = dpp; + for (int c = 0; c < 4; ++c) { + __m128i s0, s0_sig, s0_idx, s0_val; + // load coefficients + s0 = _mm_load_si128((__m128i*)dp); + // find significant samples in this row + s0_sig = _mm_shuffle_epi8(sig_vec, m); + s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128()); + // get MRP bit index, and MRP pattern + s0_idx = _mm_shuffle_epi8(ex_sum, m); + s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx); + // keep data from significant samples only + s0_val = _mm_andnot_si128(s0_sig, s0_val); + // move mrp bits to correct position, and employ + s0_val = _mm_slli_epi32(s0_val, (si32)p - 2); + s0 = _mm_xor_si128(s0, s0_val); + // store coefficients + _mm_store_si128((__m128i*)dp, s0); + // prepare for next row + dp += stride; + m = _mm_add_epi32(m, _mm_set1_epi32(1)); + } + } + // consume data according to the number of bits set + rev_advance_mrp(&magref, (ui32)total_bits); + } + } + } + } + + return true; + } + } +} diff --git a/src/core/coding/ojph_block_decoder_ssse3.cpp b/src/core/coding/ojph_block_decoder_ssse3.cpp index 99ae38c..9fa5800 100644 --- a/src/core/coding/ojph_block_decoder_ssse3.cpp +++ b/src/core/coding/ojph_block_decoder_ssse3.cpp @@ -1033,14 +1033,14 @@ namespace ojph { { OJPH_WARN(0x00010001, "A malformed codeblock that has more than " "one coding pass, but zero length for " - "2nd and potential 3rd pass"); + "2nd and potential 3rd pass."); num_passes = 1; } if (num_passes > 3) { OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " - "This codeblocks has %d passes", + "This codeblocks has %d passes.", num_passes); return false; } @@ -1052,7 +1052,7 @@ namespace ojph { insufficient_precision = true; OJPH_WARN(0x00010003, "32 bits are not enough to decode this " "codeblock. This message will not be " - "displayed again"); + "displayed again."); } return false; } @@ -1063,7 +1063,7 @@ namespace ojph { OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " "pass. The code can be modified to support " "this case. This message will not be " - "displayed again"); + "displayed again."); } return false; // 32 bits are not enough to decode this } @@ -1076,7 +1076,7 @@ namespace ojph { OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " "nor MagRef passes; both will be skipped. 
" "This message will not be displayed " - "again"); + "again."); } } } @@ -1086,7 +1086,7 @@ namespace ojph { if (lengths1 < 2) { - OJPH_WARN(0x00010006, "Wrong codeblock length"); + OJPH_WARN(0x00010006, "Wrong codeblock length."); return false; } @@ -1361,7 +1361,7 @@ namespace ojph { // quad 0 length len = uvlc_entry & 0x7; // quad 0 suffix length uvlc_entry >>= 3; - ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); sp[1] = u_q; u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q sp[3] = u_q; diff --git a/src/core/coding/ojph_block_encoder.cpp b/src/core/coding/ojph_block_encoder.cpp index 2023ef1..ffc9e8d 100644 --- a/src/core/coding/ojph_block_encoder.cpp +++ b/src/core/coding/ojph_block_encoder.cpp @@ -65,11 +65,12 @@ namespace ojph { static ui16 vlc_tbl1[2048] = { 0 }; //UVLC encoding - static int ulvc_cwd_pre[33]; - static int ulvc_cwd_pre_len[33]; - static int ulvc_cwd_suf[33]; - static int ulvc_cwd_suf_len[33]; - + const int num_uvlc_entries = 75; + struct uvlc_tbl_struct { + ui8 pre, pre_len, suf, suf_len, ext, ext_len; + }; + static uvlc_tbl_struct uvlc_tbl[num_uvlc_entries]; + ///////////////////////////////////////////////////////////////////////// static bool vlc_init_tables() { @@ -194,23 +195,61 @@ namespace ojph { static bool uvlc_init_tables() { //code goes from 0 to 31, extension and 32 are not supported here - ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2; - ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4; - ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1; - ulvc_cwd_pre_len[2] = 2; - ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3; - ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0; - ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1; - ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0; - ulvc_cwd_suf_len[2] = 0; - ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1; + uvlc_tbl[0].pre = 0; + uvlc_tbl[0].pre_len = 0; + uvlc_tbl[0].suf = 0; + uvlc_tbl[0].suf_len = 0; + uvlc_tbl[0].ext = 0; + uvlc_tbl[0].ext_len = 0; + + uvlc_tbl[1].pre = 1; + uvlc_tbl[1].pre_len = 1; + uvlc_tbl[1].suf = 0; + uvlc_tbl[1].suf_len = 0; + uvlc_tbl[1].ext = 0; + uvlc_tbl[1].ext_len = 0; + + uvlc_tbl[2].pre = 2; + uvlc_tbl[2].pre_len = 2; + uvlc_tbl[2].suf = 0; + uvlc_tbl[2].suf_len = 0; + uvlc_tbl[2].ext = 0; + uvlc_tbl[2].ext_len = 0; + + uvlc_tbl[3].pre = 4; + uvlc_tbl[3].pre_len = 3; + uvlc_tbl[3].suf = 0; + uvlc_tbl[3].suf_len = 1; + uvlc_tbl[3].ext = 0; + uvlc_tbl[3].ext_len = 0; + + uvlc_tbl[4].pre = 4; + uvlc_tbl[4].pre_len = 3; + uvlc_tbl[4].suf = 1; + uvlc_tbl[4].suf_len = 1; + uvlc_tbl[4].ext = 0; + uvlc_tbl[4].ext_len = 0; + for (int i = 5; i < 33; ++i) { - ulvc_cwd_pre[i] = 0; - ulvc_cwd_pre_len[i] = 3; - ulvc_cwd_suf[i] = i-5; - ulvc_cwd_suf_len[i] = 5; + uvlc_tbl[i].pre = 0; + uvlc_tbl[i].pre_len = 3; + uvlc_tbl[i].suf = (ui8)(i - 5); + uvlc_tbl[i].suf_len = 5; + uvlc_tbl[i].ext = 0; + uvlc_tbl[i].ext_len = 0; } + + for (int i = 33; i < num_uvlc_entries; ++i) + { + uvlc_tbl[i].pre = 0; + uvlc_tbl[i].pre_len = 3; + uvlc_tbl[i].suf = (ui8)(28 + (i - 33) % 4); + uvlc_tbl[i].suf_len = 5; + uvlc_tbl[i].ext = (ui8)((i - 33) / 4); + uvlc_tbl[i].ext_len = 4; + } + return true; } @@ -440,6 +479,29 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_encode64(ms_struct* msp, ui64 cwd, int cwd_len) + { + while (cwd_len > 0) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is 
full"); + int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len); + msp->tmp |= (ui32)((cwd & ((1ULL << t) - 1)) << msp->used_bits); + msp->used_bits += t; + cwd >>= t; + cwd_len -= t; + if (msp->used_bits >= msp->max_bits) + { + msp->buf[msp->pos++] = (ui8)msp->tmp; + msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8; + msp->tmp = 0; + msp->used_bits = 0; + } + } + } + ////////////////////////////////////////////////////////////////////////// static inline void ms_terminate(ms_struct* msp) @@ -467,11 +529,11 @@ namespace ojph { // // ////////////////////////////////////////////////////////////////////////// - void ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded) + void ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) { assert(num_passes == 1); (void)num_passes; //currently not used @@ -693,23 +755,23 @@ namespace ojph { if (u_q0 > 2 && u_q1 > 2) { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0-2], ulvc_cwd_pre_len[u_q0-2]); - vlc_encode(&vlc, ulvc_cwd_pre[u_q1-2], ulvc_cwd_pre_len[u_q1-2]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0-2], ulvc_cwd_suf_len[u_q0-2]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q1-2], ulvc_cwd_suf_len[u_q1-2]); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len); } else if (u_q0 > 2 && u_q1 > 0) { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]); + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); vlc_encode(&vlc, u_q1 - 1, 1); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); } else { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]); - vlc_encode(&vlc, ulvc_cwd_pre[u_q1], ulvc_cwd_pre_len[u_q1]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q1], ulvc_cwd_suf_len[u_q1]); + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); } //prepare for next iteration @@ -910,10 +972,514 @@ namespace ojph { ms_encode(&ms, s[7] & ((1U<> 1) | ((rho[1] & 8) >> 2); + s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0; + e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0; + rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0; + } + } + + + terminate_mel_vlc(&mel, &vlc); + ms_terminate(&ms); + + //copy to elastic + lengths[0] = mel.pos + vlc.pos + ms.pos; + elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded); + memcpy(coded->buf, ms.buf, ms.pos); + memcpy(coded->buf + ms.pos, mel.buf, mel.pos); + memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos); + + // put in the interface locator word + ui32 num_bytes = mel.pos + vlc.pos; + coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4); + coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0; + coded->buf[lengths[0]-2] = + (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF)); + + coded->avail_size -= lengths[0]; + } + + 
////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + ////////////////////////////////////////////////////////////////////////// + void ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) + { + assert(num_passes == 1); + (void)num_passes; //currently not used + // 38 bits/sample + 1 color + 4 wavelet = 43 bits per sample. + // * 4096 samples / 8 bits per byte = 22016; then rounded up to the + // nearest 1 kB, givin 22528. This expanded further to take into + // consideration stuffing at a max rate of 16 bits per 15 bits + // (1 bit for every 15 bits of data); in reality, it is much smaller + // than this. + const int ms_size = (22528 * 16 + 14) / 15; //more than enough + ui8 ms_buf[ms_size]; + // For each quad, we need at most, 7 bits for VLC and 12 bits for UVLC. + // So we have 1024 quads * 19 / 8, which is 2432. This must be + // multiplied by 16 / 15 to accommodate stuffing. + // The mel is at most around 1 bit/quad, giving around 128 byte -- in + // practice there was on case where it got to 132 bytes. Even + // accounting for stuffing, it is smaller than 192. Therefore, + // 3072 is more than enough + const int mel_vlc_size = 3072; //more than enough + ui8 mel_vlc_buf[mel_vlc_size]; + const int mel_size = 192; + ui8 *mel_buf = mel_vlc_buf; + const int vlc_size = mel_vlc_size - mel_size; + ui8 *vlc_buf = mel_vlc_buf + mel_size; + + mel_struct mel; + mel_init(&mel, mel_size, mel_buf); + vlc_struct vlc; + vlc_init(&vlc, vlc_size, vlc_buf); + ms_struct ms; + ms_init(&ms, ms_size, ms_buf); + + ui32 p = 62 - missing_msbs; + + //e_val: E values for a line (these are the highest set bit) + //cx_val: is the context values + //Each byte stores the info for the 2 sample. For E, it is maximum + // of the two samples, while for cx, it is the OR of these two samples. + //The maximum is between the pixel at the bottom left of one quad + // and the bottom right of the earlier quad. The same is true for cx. + //For a 1024 pixels, we need 512 bytes, the 2 extra, + // one for the non-existing earlier quad, and one for beyond the + // the end + ui8 e_val[513]; + ui8 cx_val[513]; + ui8* lep = e_val; lep[0] = 0; + ui8* lcxp = cx_val; lcxp[0] = 0; + + //initial row of quads + int e_qmax[2] = {0,0}, e_q[8] = {0,0,0,0,0,0,0,0}; + int rho[2] = {0,0}; + int c_q0 = 0; + ui64 s[8] = {0,0,0,0,0,0,0,0}, val, t; + ui32 y = 0; + ui64 *sp = buf; + for (ui32 x = 0; x < width; x += 4) + { + //prepare two quads + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL; // 2 \mu_p + if (val) + { + rho[0] = 1; + e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = e_q[0]; + s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? 
sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 2; + e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[1]); + s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 1 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 4; + e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[2]); + s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 8; + e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[3]); + s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int Uq0 = ojph_max(e_qmax[0], 1); //kappa_q = 1 + int u_q0 = Uq0 - 1, u_q1 = 0; //kappa_q = 1 + + int eps0 = 0; + if (u_q0 > 0) + { + eps0 |= (e_q[0] == e_qmax[0]); + eps0 |= (e_q[1] == e_qmax[0]) << 1; + eps0 |= (e_q[2] == e_qmax[0]) << 2; + eps0 |= (e_q[3] == e_qmax[0]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + lep[0] = (ui8)e_q[3]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + lcxp[0] = (ui8)((rho[0] & 8) >> 3); + + ui16 tuple0 = vlc_tbl0[(c_q0 << 8) + (rho[0] << 4) + eps0]; + vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7); + + if (c_q0 == 0) + mel_encode(&mel, rho[0] != 0); + + int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0; + ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m); + m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0; + ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m); + m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0; + ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m); + m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0; + ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m); + + if (x + 2 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] = 1; + e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = e_q[4]; + s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 2; + e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[5]); + s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 3 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 4; + e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[6]); + s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? 
sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 8; + e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[7]); + s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int c_q1 = (rho[0] >> 1) | (rho[0] & 1); + int Uq1 = ojph_max(e_qmax[1], 1); //kappa_q = 1 + u_q1 = Uq1 - 1; //kappa_q = 1 + + int eps1 = 0; + if (u_q1 > 0) + { + eps1 |= (e_q[4] == e_qmax[1]); + eps1 |= (e_q[5] == e_qmax[1]) << 1; + eps1 |= (e_q[6] == e_qmax[1]) << 2; + eps1 |= (e_q[7] == e_qmax[1]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++; + lep[0] = (ui8)e_q[7]; + lcxp[0] |= (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++; + lcxp[0] = (ui8)((rho[1] & 8) >> 3); + ui16 tuple1 = vlc_tbl0[(c_q1 << 8) + (rho[1] << 4) + eps1]; + vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7); + + if (c_q1 == 0) + mel_encode(&mel, rho[1] != 0); + + int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0; + ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m); + m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0; + ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m); + m = (rho[1] & 4) ? Uq1 - ((tuple1 & 4) >> 2) : 0; + ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m); + m = (rho[1] & 8) ? Uq1 - ((tuple1 & 8) >> 3) : 0; + ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m); + } + + if (u_q0 > 0 && u_q1 > 0) + mel_encode(&mel, ojph_min(u_q0, u_q1) > 2); + + if (u_q0 > 2 && u_q1 > 2) + { + vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].ext, uvlc_tbl[u_q0-2].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].ext, uvlc_tbl[u_q1-2].ext_len); + } + else if (u_q0 > 2 && u_q1 > 0) + { + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, u_q1 - 1, 1); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + } + else + { + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len); + } + + //prepare for next iteration + c_q0 = (rho[1] >> 1) | (rho[1] & 1); + s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0; + e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0; + rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0; + } + + lep[1] = 0; + + for (y = 2; y < height; y += 2) + { + lep = e_val; + int max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = 0; + lcxp = cx_val; + c_q0 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = 0; + + sp = buf + y * stride; + for (ui32 x = 0; x < width; x += 4) + { + //prepare two quads + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] = 1; + e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = e_q[0]; + s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? 
sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 2; + e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[1]); + s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 1 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 4; + e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[2]); + s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 8; + e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[3]); + s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int kappa = (rho[0] & (rho[0]-1)) ? ojph_max(1,max_e) : 1; + int Uq0 = ojph_max(e_qmax[0], kappa); + int u_q0 = Uq0 - kappa, u_q1 = 0; + + int eps0 = 0; + if (u_q0 > 0) + { + eps0 |= (e_q[0] == e_qmax[0]); + eps0 |= (e_q[1] == e_qmax[0]) << 1; + eps0 |= (e_q[2] == e_qmax[0]) << 2; + eps0 |= (e_q[3] == e_qmax[0]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = (ui8)e_q[3]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + int c_q1 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = (ui8)((rho[0] & 8) >> 3); + ui16 tuple0 = vlc_tbl1[(c_q0 << 8) + (rho[0] << 4) + eps0]; + vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7); + + if (c_q0 == 0) + mel_encode(&mel, rho[0] != 0); + + int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0; + ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m); + m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0; + ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m); + m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0; + ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m); + m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0; + ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m); + + if (x + 2 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] = 1; + e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = e_q[4]; + s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 2; + e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[5]); + s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 3 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 4; + e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[6]); + s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 8; + e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[7]); + s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + kappa = (rho[1] & (rho[1]-1)) ? 
ojph_max(1,max_e) : 1; + c_q1 |= ((rho[0] & 4) >> 1) | ((rho[0] & 8) >> 2); + int Uq1 = ojph_max(e_qmax[1], kappa); + u_q1 = Uq1 - kappa; + + int eps1 = 0; + if (u_q1 > 0) + { + eps1 |= (e_q[4] == e_qmax[1]); + eps1 |= (e_q[5] == e_qmax[1]) << 1; + eps1 |= (e_q[6] == e_qmax[1]) << 2; + eps1 |= (e_q[7] == e_qmax[1]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++; + max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = (ui8)e_q[7]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++; + c_q0 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = (ui8)((rho[1] & 8) >> 3); + ui16 tuple1 = vlc_tbl1[(c_q1 << 8) + (rho[1] << 4) + eps1]; + vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7); + + if (c_q1 == 0) + mel_encode(&mel, rho[1] != 0); + + int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0; + ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m); + m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0; + ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m); + m = (rho[1] & 4) ? Uq1 - ((tuple1 & 4) >> 2) : 0; + ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m); + m = (rho[1] & 8) ? Uq1 - ((tuple1 & 8) >> 3) : 0; + ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m); + } + + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len); //prepare for next iteration c_q0 |= ((rho[1] & 4) >> 1) | ((rho[1] & 8) >> 2); diff --git a/src/core/coding/ojph_block_encoder.h b/src/core/coding/ojph_block_encoder.h index 0c4b926..72b3c0d 100644 --- a/src/core/coding/ojph_block_encoder.h +++ b/src/core/coding/ojph_block_encoder.h @@ -52,11 +52,25 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void - ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded); + ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded); + + void + ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded); + + void + ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs, + ui32 num_passes, ui32 width, ui32 height, + ui32 stride, ui32* lengths, + ojph::mem_elastic_allocator* elastic, + ojph::coded_lists*& coded); void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs, @@ -64,6 +78,9 @@ namespace ojph { ui32 stride, ui32* lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *& coded); + + bool initialize_block_encoder_tables_avx2(); + bool initialize_block_encoder_tables_avx512(); } } diff --git a/src/core/coding/ojph_block_encoder_avx2.cpp b/src/core/coding/ojph_block_encoder_avx2.cpp new file mode 100644 index 0000000..7624272 --- /dev/null +++ b/src/core/coding/ojph_block_encoder_avx2.cpp @@ -0,0 +1,1213 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. 
+// +// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2019, The University of New South Wales, Australia +// Copyright (c) 2024, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_block_encoder_avx2.cpp +//***************************************************************************/ + +#include <cassert> +#include <cstring> +#include <cstdint> +#include <climits> +#include <immintrin.h> + +#include "ojph_mem.h" +#include "ojph_arch.h" +#include "ojph_block_encoder.h" +#include "ojph_message.h" + +#ifdef OJPH_COMPILER_MSVC + #define likely(x) (x) + #define unlikely(x) (x) +#else + #define likely(x) __builtin_expect((x), 1) + #define unlikely(x) __builtin_expect((x), 0) +#endif + +namespace ojph { + namespace local { + + ///////////////////////////////////////////////////////////////////////// + // tables + ///////////////////////////////////////////////////////////////////////// + + //VLC encoding + // index is (c_q << 8) + (rho << 4) + eps + // data is (cwd << 8) + (cwd_len << 4) + eps + // table 0 is for the initial line of quads + static ui32 vlc_tbl0[2048]; + static ui32 vlc_tbl1[2048]; + + //UVLC encoding + static ui32 ulvc_cwd_pre[33]; + static int ulvc_cwd_pre_len[33]; + static ui32 ulvc_cwd_suf[33]; + static int ulvc_cwd_suf_len[33]; + + ///////////////////////////////////////////////////////////////////////// + static bool vlc_init_tables() + { + struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; }; + vlc_src_table tbl0[] = { + #include "table0.h" + }; + size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table); + + si32 pattern_popcnt[16]; + for (ui32 i = 0; i < 16; ++i) + pattern_popcnt[i] = (si32)population_count(i); + + vlc_src_table* src_tbl = tbl0; + ui32 *tgt_tbl = vlc_tbl0; + size_t tbl_size = tbl0_size; + for (int i = 0; i < 2048; ++i) + { + int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF; + if (((emb & rho) != emb) || (rho == 0 && c_q == 0)) + tgt_tbl[i] = 0; + else + { + vlc_src_table *best_entry = NULL; + if (emb) // u_off = 1 + { + int best_e_k = -1; + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q 
&& src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 1) + if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1) + { + //now we need to find the smallest cwd with the highest + // number of bits set in e_k + int ones_count = pattern_popcnt[src_tbl[j].e_k]; + if (ones_count >= best_e_k) + { + best_entry = src_tbl + j; + best_e_k = ones_count; + } + } + } + } + else // u_off = 0 + { + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 0) + { + best_entry = src_tbl + j; + break; + } + } + } + assert(best_entry); + tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4) + + best_entry->e_k); + } + } + + vlc_src_table tbl1[] = { + #include "table1.h" + }; + size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table); + + src_tbl = tbl1; + tgt_tbl = vlc_tbl1; + tbl_size = tbl1_size; + for (int i = 0; i < 2048; ++i) + { + int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF; + if (((emb & rho) != emb) || (rho == 0 && c_q == 0)) + tgt_tbl[i] = 0; + else + { + vlc_src_table *best_entry = NULL; + if (emb) // u_off = 1 + { + int best_e_k = -1; + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 1) + if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1) + { + //now we need to find the smallest cwd with the highest + // number of bits set in e_k + int ones_count = pattern_popcnt[src_tbl[j].e_k]; + if (ones_count >= best_e_k) + { + best_entry = src_tbl + j; + best_e_k = ones_count; + } + } + } + } + else // u_off = 0 + { + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 0) + { + best_entry = src_tbl + j; + break; + } + } + } + assert(best_entry); + tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4) + + best_entry->e_k); + } + } + + + return true; + } + + ///////////////////////////////////////////////////////////////////////// + static bool uvlc_init_tables() + { + //code goes from 0 to 31, extension and 32 are not supported here + ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2; + ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4; + ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1; + ulvc_cwd_pre_len[2] = 2; + ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3; + ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0; + ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1; + ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0; + ulvc_cwd_suf_len[2] = 0; + ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1; + for (int i = 5; i < 33; ++i) + { + ulvc_cwd_pre[i] = 0; + ulvc_cwd_pre_len[i] = 3; + ulvc_cwd_suf[i] = (ui32)(i-5); + ulvc_cwd_suf_len[i] = 5; + } + return true; + } + + ///////////////////////////////////////////////////////////////////////// + static bool tables_initialized = false; + + ///////////////////////////////////////////////////////////////////////// + bool initialize_block_encoder_tables_avx2() { + if (!tables_initialized) { + memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); + memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); + tables_initialized = vlc_init_tables(); + tables_initialized = tables_initialized && uvlc_init_tables(); + } + return tables_initialized; + } + + ///////////////////////////////////////////////////////////////////////// + // + ///////////////////////////////////////////////////////////////////////// + struct mel_struct { + //storage + ui8* buf; //pointer to data buffer + ui32 pos; //position of next writing within buf + ui32 buf_size; //size of buffer, which we 
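// --------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how an entry of the tables
// built above is consumed. Each entry packs (cwd << 8) + (cwd_len << 4) +
// e_k and is indexed by (c_q << 8) + (rho << 4) + eps; names hypothetical.
struct vlc_entry_sketch { ui32 cwd; int cwd_len; int e_k; };
static vlc_entry_sketch lookup_vlc_sketch(const ui32* tbl,
                                          int c_q, int rho, int eps)
{
  ui32 v = tbl[(c_q << 8) + (rho << 4) + eps];
  vlc_entry_sketch e;
  e.cwd = v >> 8;           // the VLC codeword itself
  e.cwd_len = (v >> 4) & 7; // its length in bits
  e.e_k = v & 0xF;          // per-sample "exponent equals e_qmax" flags
  return e;
}
// --------------------------------------------------------------------------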
must not exceed + + // all these can be replaced by bytes + int remaining_bits; //number of empty bits in tmp + int tmp; //temporary storage of coded bits + int run; //number of 0 run + int k; //state + int threshold; //threshold where one bit must be coded + }; + + ////////////////////////////////////////////////////////////////////////// + static inline void + mel_init(mel_struct* melp, ui32 buffer_size, ui8* data) + { + melp->buf = data; + melp->pos = 0; + melp->buf_size = buffer_size; + melp->remaining_bits = 8; + melp->tmp = 0; + melp->run = 0; + melp->k = 0; + melp->threshold = 1; // this is 1 << mel_exp[melp->k]; + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + mel_emit_bit(mel_struct* melp, int v) + { + melp->tmp = (melp->tmp << 1) + v; + melp->remaining_bits--; + if (melp->remaining_bits == 0) { + melp->buf[melp->pos++] = (ui8)melp->tmp; + melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8); + melp->tmp = 0; + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + mel_encode(mel_struct* melp, bool bit) + { + //MEL exponent + static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5}; + + if (bit == false) { + ++melp->run; + if (melp->run >= melp->threshold) { + mel_emit_bit(melp, 1); + melp->run = 0; + melp->k = ojph_min(12, melp->k + 1); + melp->threshold = 1 << mel_exp[melp->k]; + } + } else { + mel_emit_bit(melp, 0); + int t = mel_exp[melp->k]; + while (t > 0) { + mel_emit_bit(melp, (melp->run >> --t) & 1); + } + melp->run = 0; + melp->k = ojph_max(0, melp->k - 1); + melp->threshold = 1 << mel_exp[melp->k]; + } + } + + ///////////////////////////////////////////////////////////////////////// + // + ///////////////////////////////////////////////////////////////////////// + struct vlc_struct { + //storage + ui8* buf; //pointer to data buffer + ui32 pos; //position of next writing within buf + ui32 buf_size; //size of buffer, which we must not exceed + + int used_bits; //number of occupied bits in tmp + ui64 tmp; //temporary storage of coded bits + bool last_greater_than_8F; //true if last byte is greater than 0x8F + }; + + ////////////////////////////////////////////////////////////////////////// + static inline void + vlc_init(vlc_struct* vlcp, ui32 buffer_size, ui8* data) + { + vlcp->buf = data + buffer_size - 1; //points to last byte + vlcp->pos = 1; //locations will be all -pos + vlcp->buf_size = buffer_size; + + vlcp->buf[0] = 0xFF; + vlcp->used_bits = 4; + vlcp->tmp = 0xF; + vlcp->last_greater_than_8F = true; + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + vlc_encode(vlc_struct* vlcp, ui32 cwd, int cwd_len) + { + vlcp->tmp |= (ui64)cwd << vlcp->used_bits; + vlcp->used_bits += cwd_len; + + while (vlcp->used_bits >= 8) { + ui8 tmp; + + if (unlikely(vlcp->last_greater_than_8F)) { + tmp = vlcp->tmp & 0x7F; + + if (likely(tmp != 0x7F)) { + tmp = vlcp->tmp & 0xFF; + *(vlcp->buf - vlcp->pos) = tmp; + vlcp->last_greater_than_8F = tmp > 0x8F; + vlcp->tmp >>= 8; + vlcp->used_bits -= 8; + } else { + *(vlcp->buf - vlcp->pos) = tmp; + vlcp->last_greater_than_8F = false; + vlcp->tmp >>= 7; + vlcp->used_bits -= 7; + } + + } else { + tmp = vlcp->tmp & 0xFF; + *(vlcp->buf - vlcp->pos) = tmp; + vlcp->last_greater_than_8F = tmp > 0x8F; + vlcp->tmp >>= 8; + vlcp->used_bits -= 8; + } + + vlcp->pos++; + } + } + + ////////////////////////////////////////////////////////////////////////// + // + 
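// --------------------------------------------------------------------------
// Usage sketch for the MEL coder defined above (illustrative, not part of
// the patch): a completed run of threshold zeros costs one '1' bit, a one
// costs a '0' bit plus mel_exp[k] bits of run position, and k adapts up on
// full runs and down on ones. The buffer size here is arbitrary.
static void mel_demo_sketch()
{
  ui8 buf[64];
  mel_struct mel;
  mel_init(&mel, sizeof(buf), buf);
  const bool events[6] = { false, false, false, true, false, true };
  for (int i = 0; i < 6; ++i)
    mel_encode(&mel, events[i]);   // e.g. "is this quad significant?"
  if (mel.run > 0)                 // flush the pending run, as
    mel_emit_bit(&mel, 1);         // terminate_mel_vlc() does below
}
// --------------------------------------------------------------------------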
////////////////////////////////////////////////////////////////////////// + static inline void + terminate_mel_vlc(mel_struct* melp, vlc_struct* vlcp) + { + if (melp->run > 0) + mel_emit_bit(melp, 1); + + if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) { + *(vlcp->buf - vlcp->pos) = 0x7f; + vlcp->pos++; + vlcp->tmp >>= 7; + vlcp->used_bits -= 7; + } + + melp->tmp = melp->tmp << melp->remaining_bits; + int mel_mask = (0xFF << melp->remaining_bits) & 0xFF; + int vlc_mask = 0xFF >> (8 - vlcp->used_bits); + if ((mel_mask | vlc_mask) == 0) + return; //last mel byte cannot be 0xFF, since then + //melp->remaining_bits would be < 8 + if (melp->pos >= melp->buf_size) + OJPH_ERROR(0x00020003, "mel encoder's buffer is full"); + ui8 vlcp_tmp = (ui8)vlcp->tmp; + int fuse = melp->tmp | vlcp_tmp; + if ( ( ((fuse ^ melp->tmp) & mel_mask) + | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0 + && (fuse != 0xFF) && vlcp->pos > 1) + { + melp->buf[melp->pos++] = (ui8)fuse; + } + else + { + if (vlcp->pos >= vlcp->buf_size) + OJPH_ERROR(0x00020004, "vlc encoder's buffer is full"); + melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF + *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp; + vlcp->pos++; + } + } + +///////////////////////////////////////////////////////////////////////// +// +///////////////////////////////////////////////////////////////////////// + struct ms_struct { + //storage + ui8* buf; //pointer to data buffer + ui32 pos; //position of next writing within buf + ui32 buf_size; //size of buffer, which we must not exceed + + int max_bits; //maximum number of bits that can be stored in tmp + int used_bits; //number of occupied bits in tmp + ui32 tmp; //temporary storage of coded bits + }; + + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_init(ms_struct* msp, ui32 buffer_size, ui8* data) + { + msp->buf = data; + msp->pos = 0; + msp->buf_size = buffer_size; + msp->max_bits = 8; + msp->used_bits = 0; + msp->tmp = 0; + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_encode(ms_struct* msp, ui64 cwd, int cwd_len) + { + while (cwd_len > 0) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full"); + int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len); + msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits; + msp->used_bits += t; + cwd >>= t; + cwd_len -= t; + if (msp->used_bits >= msp->max_bits) + { + msp->buf[msp->pos++] = (ui8)msp->tmp; + msp->max_bits = (msp->tmp == 0xFF) ? 
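// --------------------------------------------------------------------------
// The ternary completing on the next line is JPEG 2000 bit stuffing: after
// a byte equal to 0xFF only 7 bits of the following byte may be used, so
// the MagSgn stream can never contain a forbidden 0xFF9x..0xFFFF pair.
// The same rule in isolation (illustrative helper, not part of the patch):
static int usable_bits_after_sketch(ui8 prev_byte)
{
  return prev_byte == 0xFF ? 7 : 8;
}
// --------------------------------------------------------------------------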
7 : 8; + msp->tmp = 0; + msp->used_bits = 0; + } + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_terminate(ms_struct* msp) + { + if (msp->used_bits) + { + int t = msp->max_bits - msp->used_bits; //unused bits + msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits; + msp->used_bits += t; + if (msp->tmp != 0xFF) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full"); + msp->buf[msp->pos++] = (ui8)msp->tmp; + } + } + else if (msp->max_bits == 7) + msp->pos--; + } + +#define ZERO _mm256_setzero_si256() +#define ONE _mm256_set1_epi32(1) + +// https://stackoverflow.com/a/58827596 +inline __m256i avx2_lzcnt_epi32(__m256i v) { + // prevent value from being rounded up to the next power of two + v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep 8 MSB + + v = _mm256_castps_si256(_mm256_cvtepi32_ps(v)); // convert an integer to float + v = _mm256_srli_epi32(v, 23); // shift down the exponent + v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias + v = _mm256_min_epi16(v, _mm256_set1_epi32(32)); // clamp at 32 + + return v; +} + +inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) { + return _mm256_xor_si256(_mm256_cmpeq_epi32(v, v2), _mm256_set1_epi32((int32_t)0xffffffff)); +} + +static void proc_pixel(__m256i *src_vec, ui32 p, + __m256i *eq_vec, __m256i *s_vec, + __m256i &rho_vec, __m256i &e_qmax_vec) +{ + __m256i val_vec[4]; + __m256i _eq_vec[4]; + __m256i _s_vec[4]; + __m256i _rho_vec[4]; + + for (ui32 i = 0; i < 4; ++i) { + /* val = t + t; //multiply by 2 and get rid of sign */ + val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]); + + /* val >>= p; // 2 \mu_p + x */ + val_vec[i] = _mm256_srli_epi32(val_vec[i], (int)p); + + /* val &= ~1u; // 2 \mu_p */ + val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((int)~1u)); + + /* if (val) { */ + const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO); + + /* rho[i] = 1 << i; + * rho is processed below. 
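// --------------------------------------------------------------------------
// Why avx2_lzcnt_epi32 above works, restated for one lane (a sketch that
// assumes IEEE-754 single precision): v &= ~(v >> 8) keeps the 8 MSBs so
// rounding cannot carry into the exponent, the int-to-float conversion then
// leaves floor(log2(v)) + 127 in bits 23..30, and 158 - exponent is the
// leading zero count, clamped to 32 for v == 0. Name is illustrative.
static ui32 lzcnt_one_lane_sketch(ui32 v)
{
  v &= ~(v >> 8);                  // same masking as the vector code
  float f = (float)v;
  ui32 bits;
  memcpy(&bits, &f, sizeof(bits)); // bit-cast; the sign bit is 0 here
  ui32 e = bits >> 23;             // biased exponent
  ui32 lz = 158 - e;               // 158 = 127 + 31
  return lz > 32 ? 32 : lz;        // v == 0 maps to 32
}
// --------------------------------------------------------------------------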
+ */ + + /* e_q[i] = 32 - (int)count_leading_zeros(--val); //2\mu_p - 1 */ + val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE); + _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]); + _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]); + + /* e_qmax[i] = ojph_max(e_qmax[i], e_q[i]); + * e_qmax is processed below + */ + + /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */ + val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE); + _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31); + _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]); + + _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask); + _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask); + val_vec[i] = _mm256_srli_epi32(val_notmask, 31); + /* } */ + } + + const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + + /* Reorder from + * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7] + * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5],.[1, 6], [1, 7] + * *_vec[2]:[0, 8], [0, 9], [0,10], [0,11], [0,12], [0,13], [0,14], [0,15] + * *_vec[3]:[1, 8], [1, 9], [1,10], [1,11], [1,12], [1,13], [1,14], [1,15] + * to + * *_vec[0]:[0, 0], [0, 2], [0, 4], [0, 6], [0, 8], [0,10], [0,12], [0,14] + * *_vec[1]:[1, 0], [1, 2], [1, 4], [1, 6], [1, 8], [1,10], [1,12], [1,14] + * *_vec[2]:[0, 1], [0, 3], [0, 5], [0, 7], [0, 9], [0,11], [0,13], [0,15] + * *_vec[3]:[1, 1], [1, 3], [1, 5], [1, 7], [1, 9], [1,11], [1,13], [1,15] + */ + __m256i tmp1, tmp2; + for (ui32 i = 0; i < 2; ++i) { + tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx); + tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx); + eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4)); + eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4)); + + tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx); + tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx); + s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4)); + s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4)); + + tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx); + tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx); + _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4)); + _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4)); + } + + e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]); + e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]); + e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]); + _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1); + _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2); + _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3); + rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]); + rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]); + rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]); +} + +/* from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, ...] + * [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, ...] + * [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, ...] + * [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ...] + * + * to [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31, + * 0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33] + * + * [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35, + * 0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37] + * + * [..] 
+ */ +static void rotate_matrix(__m256i *matrix) +{ + __m256i tmp1 = _mm256_unpacklo_epi32(matrix[0], matrix[1]); + __m256i tmp2 = _mm256_unpacklo_epi32(matrix[2], matrix[3]); + __m256i tmp3 = _mm256_unpackhi_epi32(matrix[0], matrix[1]); + __m256i tmp4 = _mm256_unpackhi_epi32(matrix[2], matrix[3]); + + matrix[0] = _mm256_unpacklo_epi64(tmp1, tmp2); + matrix[1] = _mm256_unpacklo_epi64(tmp3, tmp4); + matrix[2] = _mm256_unpackhi_epi64(tmp1, tmp2); + matrix[3] = _mm256_unpackhi_epi64(tmp3, tmp4); + + tmp1 = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x20); + matrix[2] = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x31); + matrix[0] = tmp1; + + tmp1 = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x20); + matrix[3] = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x31); + matrix[1] = tmp1; +} + +static void proc_ms_encode(ms_struct *msp, + __m256i &tuple_vec, + __m256i &uq_vec, + __m256i &rho_vec, + __m256i *s_vec) +{ + __m256i m_vec[4]; + + /* Prepare parameters for ms_encode */ + /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */ + auto tmp = _mm256_and_si256(tuple_vec, ONE); + tmp = _mm256_sub_epi32(uq_vec, tmp); + auto tmp1 = _mm256_and_si256(rho_vec, ONE); + auto mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[0] = _mm256_and_si256(mask, tmp); + + /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */ + tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2)); + tmp = _mm256_srli_epi32(tmp, 1); + tmp = _mm256_sub_epi32(uq_vec, tmp); + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2)); + mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[1] = _mm256_and_si256(mask, tmp); + + /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */ + tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4)); + tmp = _mm256_srli_epi32(tmp, 2); + tmp = _mm256_sub_epi32(uq_vec, tmp); + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4)); + mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[2] = _mm256_and_si256(mask, tmp); + + /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */ + tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8)); + tmp = _mm256_srli_epi32(tmp, 3); + tmp = _mm256_sub_epi32(uq_vec, tmp); + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8)); + mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[3] = _mm256_and_si256(mask, tmp); + + rotate_matrix(m_vec); + /* s_vec from + * s_vec[0]:[0, 0], [0, 2] ... [0,14], [0, 16], [0, 18] ... [0,30] + * s_vec[1]:[1, 0], [1, 2] ... [1,14], [1, 16], [1, 18] ... [1,30] + * s_vec[2]:[0, 1], [0, 3] ... [0,15], [0, 17], [0, 19] ... [0,31] + * s_vec[3]:[1, 1], [1, 3] ... [1,15], [1, 17], [1, 19] ... 
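// --------------------------------------------------------------------------
// Scalar form of the m_vec arithmetic above (illustrative, not part of the
// patch): a significant sample n of a quad sends m_n = Uq - emb_n
// magnitude-sign bits, where emb_n is that sample's e_k bit in the VLC
// tuple (the top magnitude bit is implicit when its exponent equals Uq).
static int magsgn_bits_sketch(int rho, int tuple, int Uq, int n) // n: 0..3
{
  if (((rho >> n) & 1) == 0)
    return 0;                  // insignificant sample: nothing to send
  int emb = (tuple >> n) & 1;  // e_k bit for this sample
  return Uq - emb;             // m_n
}
// --------------------------------------------------------------------------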
[1,31] + * to + * s_vec[0]:[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]...[0, 7], [1, 7] + * s_vec[1]:[0, 8], [1, 8], [0, 9], [1, 9], [0,10], [1,10]...[0,15], [1,15] + * s_vec[2]:[0,16], [1,16], [0,17], [1,17], [0,18], [1,18]...[0,23], [1,23] + * s_vec[3]:[0,24], [1,24], [0,25], [1,25], [0,26], [1,26]...[0,31], [1,31] + */ + rotate_matrix(s_vec); + + ui32 cwd[8]; + int cwd_len[8]; + ui64 _cwd = 0; + int _cwd_len = 0; + + /* Each iteration processes 8 bytes * 2 lines */ + for (ui32 i = 0; i < 4; ++i) { + /* cwd = s[i * 4 + 0] & ((1U << m) - 1) + * cwd_len = m + */ + _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]); + tmp = _mm256_sllv_epi32(ONE, m_vec[i]); + tmp = _mm256_sub_epi32(tmp, ONE); + tmp = _mm256_and_si256(tmp, s_vec[i]); + _mm256_storeu_si256((__m256i*)cwd, tmp); + + for (ui32 j = 0; j < 4; ++j) { + ui32 idx = j * 2; + _cwd = cwd[idx]; + _cwd_len = cwd_len[idx]; + _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len; + _cwd_len += cwd_len[idx + 1]; + ms_encode(msp, _cwd, _cwd_len); + } + } +} + +static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec, + __m256i &e_qmax_vec) +{ + /* if (u_q[i] > 0) { + * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]); + * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1; + * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2; + * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3; + * } + */ + auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO); + + auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec); + auto eps_vec = _mm256_srli_epi32(mask, 31); + + mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec); + auto tmp = _mm256_srli_epi32(mask, 31); + tmp = _mm256_slli_epi32(tmp, 1); + eps_vec = _mm256_or_si256(eps_vec, tmp); + + mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec); + tmp = _mm256_srli_epi32(mask, 31); + tmp = _mm256_slli_epi32(tmp, 2); + eps_vec = _mm256_or_si256(eps_vec, tmp); + + mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec); + tmp = _mm256_srli_epi32(mask, 31); + tmp = _mm256_slli_epi32(tmp, 3); + eps_vec = _mm256_or_si256(eps_vec, tmp); + + return _mm256_and_si256(u_q_mask, eps_vec); +} + +static void update_lep(ui32 x, __m256i &prev_e_val_vec, + __m256i *eq_vec, __m256i *e_val_vec, + const __m256i left_shift) +{ + /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + * lep[0] = (ui8)e_q[3]; + * Compare e_q[1] with e_q[3] of the previous round. + */ + auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift); + tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0); + prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0); + e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp); +} + + +static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec, + __m256i &rho_vec, __m256i *cx_val_vec, + const __m256i left_shift) +{ + /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + * lcxp[0] = (ui8)((rho[0] & 8) >> 3); + * Or (rho[0] & 2) and (rho[0] of the previous round & 8). 
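// --------------------------------------------------------------------------
// update_lep/update_lcxp above vectorize the scalar line state kept by the
// 64-bit encoder earlier in this patch: per quad column, e_val records the
// larger of the two bottom-row exponents so the next quad row can form
// max_e, and cx_val records the bottom-row significance bits that seed the
// next row's contexts. One-quad scalar restatement (illustrative names):
static void update_line_state_sketch(ui8* lep, ui8* lcxp,
                                     int e_q1, int e_q3, int rho)
{
  lep[0] = (ui8)ojph_max((int)lep[0], e_q1);   // close the previous column
  lep[1] = (ui8)e_q3;                          // open the next one
  lcxp[0] = (ui8)(lcxp[0] | ((rho & 2) >> 1)); // bottom-left significance
  lcxp[1] = (ui8)((rho & 8) >> 3);             // bottom-right significance
}
// --------------------------------------------------------------------------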
+ */ + auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift); + tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0); + prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0); + + tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8)); + tmp = _mm256_srli_epi32(tmp, 3); + + auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2)); + tmp1 = _mm256_srli_epi32(tmp1, 1); + cx_val_vec[x] = _mm256_or_si256(tmp, tmp1); +} + +static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec, + __m256i &eps_vec, ui32 *vlc_tbl) +{ + /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */ + auto tmp = _mm256_slli_epi32(cq_vec, 8); + auto tmp1 = _mm256_slli_epi32(rho_vec, 4); + tmp = _mm256_add_epi32(tmp, tmp1); + tmp = _mm256_add_epi32(tmp, eps_vec); + return _mm256_i32gather_epi32((const int *)vlc_tbl, tmp, 4); +} + +static __m256i proc_cq1(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec, + const __m256i right_shift) +{ + ojph_unused(x); + ojph_unused(cx_val_vec); + ojph_unused(right_shift); + + /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */ + auto tmp = _mm256_srli_epi32(rho_vec, 1); + auto tmp1 = _mm256_and_si256(rho_vec, ONE); + return _mm256_or_si256(tmp, tmp1); +} + +static __m256i proc_cq2(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec, + const __m256i right_shift) +{ + // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2)) + // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2)); + auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift); + auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift); + + tmp = _mm256_insert_epi64(tmp, _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3); + tmp = _mm256_slli_epi32(tmp, 2); + auto tmp1 = _mm256_insert_epi32(lcxp1_vec, _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7); + tmp = _mm256_add_epi32(tmp1, tmp); + + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4)); + tmp1 = _mm256_srli_epi32(tmp1, 1); + tmp = _mm256_or_si256(tmp, tmp1); + + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8)); + tmp1 = _mm256_srli_epi32(tmp1, 2); + + return _mm256_or_si256(tmp, tmp1); +} + +using fn_proc_cq = __m256i (*)(ui32, __m256i *, __m256i &, const __m256i); + +static void proc_mel_encode1(mel_struct *melp, __m256i &cq_vec, + __m256i &rho_vec, __m256i u_q_vec, ui32 ignore, + const __m256i right_shift) +{ + int32_t mel_need_encode[8]; + int32_t mel_need_encode2[8]; + int32_t mel_bit[8]; + int32_t mel_bit2[8]; + /* Prepare mel_encode params */ + /* if (c_q[i] == 0) { */ + _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO)); + /* mel_encode(&mel, rho[i] != 0); */ + _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31)); + /* } */ + + /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */ + auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift); + auto tmp1 = _mm256_min_epi32(u_q_vec, tmp); + _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31)); + + /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */ + auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO); + _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO))); + + ui32 i_max = 8 - (ignore / 2); + + for (ui32 i = 0; i < i_max; i += 2) { + if (mel_need_encode[i]) { + mel_encode(melp, mel_bit[i]); + } + + if (i + 1 < i_max) { + if (mel_need_encode[i + 1]) { + mel_encode(melp, mel_bit[i + 1]); + } + } + + if 
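// --------------------------------------------------------------------------
// The two context builders above, in scalar form: on the initial quad row
// the context of quad q+1 depends only on quad q (proc_cq1); on every other
// row it also folds in the two line-state values from the row above
// (proc_cq2). Helper names are illustrative:
static int cq_initial_row_sketch(int rho_prev)
{
  return (rho_prev >> 1) | (rho_prev & 1);
}
static int cq_other_rows_sketch(int lcxp1, int lcxp2, int rho_prev)
{
  return (lcxp1 + (lcxp2 << 2))
       | (((rho_prev & 4) >> 1) | ((rho_prev & 8) >> 2));
}
// --------------------------------------------------------------------------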
(mel_need_encode2[i]) { + mel_encode(melp, mel_bit2[i]); + } + } +} + +static void proc_mel_encode2(mel_struct *melp, __m256i &cq_vec, + __m256i &rho_vec, __m256i u_q_vec, ui32 ignore, + const __m256i right_shift) +{ + ojph_unused(u_q_vec); + ojph_unused(right_shift); + int32_t mel_need_encode[8]; + int32_t mel_bit[8]; + + /* Prepare mel_encode params */ + /* if (c_q[i] == 0) { */ + _mm256_storeu_si256((__m256i*)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO)); + /* mel_encode(&mel, rho[i] != 0); */ + _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31)); + /* } */ + + ui32 i_max = 8 - (ignore / 2); + + for (ui32 i = 0; i < i_max; ++i) { + if (mel_need_encode[i]) { + mel_encode(melp, mel_bit[i]); + } + } +} + +using fn_proc_mel_encode = void (*)(mel_struct *, __m256i &, __m256i &, + __m256i, ui32, const __m256i); + +static void proc_vlc_encode1(vlc_struct *vlcp, ui32 *tuple, + ui32 *u_q, ui32 ignore) +{ + ui32 i_max = 8 - (ignore / 2); + + for (ui32 i = 0; i < i_max; i += 2) { + /* 7 bits */ + ui32 val = tuple[i + 0] >> 4; + int size = tuple[i + 0] & 7; + + if (i + 1 < i_max) { + /* 7 bits */ + val |= (tuple[i + 1] >> 4) << size; + size += tuple[i + 1] & 7; + } + + if (u_q[i] > 2 && u_q[i + 1] > 2) { + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i] - 2]) << size; + size += ulvc_cwd_pre_len[u_q[i] - 2]; + + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size; + size += ulvc_cwd_pre_len[u_q[i + 1] - 2]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i] - 2]) << size; + size += ulvc_cwd_suf_len[u_q[i] - 2]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size; + size += ulvc_cwd_suf_len[u_q[i + 1] - 2]; + + } else if (u_q[i] > 2 && u_q[i + 1] > 0) { + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i]]) << size; + size += ulvc_cwd_pre_len[u_q[i]]; + + /* 1 bit */ + val |= (u_q[i + 1] - 1) << size; + size += 1; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i]]) << size; + size += ulvc_cwd_suf_len[u_q[i]]; + + } else { + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i]]) << size; + size += ulvc_cwd_pre_len[u_q[i]]; + + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i + 1]]) << size; + size += ulvc_cwd_pre_len[u_q[i + 1]]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i]]) << size; + size += ulvc_cwd_suf_len[u_q[i]]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i + 1]]) << size; + size += ulvc_cwd_suf_len[u_q[i + 1]]; + } + + vlc_encode(vlcp, val, size); + } +} + +static void proc_vlc_encode2(vlc_struct *vlcp, ui32 *tuple, + ui32 *u_q, ui32 ignore) +{ + ui32 i_max = 8 - (ignore / 2); + + for (ui32 i = 0; i < i_max; i += 2) { + /* 7 bits */ + ui32 val = tuple[i + 0] >> 4; + int size = tuple[i + 0] & 7; + + if (i + 1 < i_max) { + /* 7 bits */ + val |= (tuple[i + 1] >> 4) << size; + size += tuple[i + 1] & 7; + } + + /* 3 bits */ + val |= ulvc_cwd_pre[u_q[i]] << size; + size += ulvc_cwd_pre_len[u_q[i]]; + + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i + 1]]) << size; + size += ulvc_cwd_pre_len[u_q[i + 1]]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i + 0]]) << size; + size += ulvc_cwd_suf_len[u_q[i + 0]]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i + 1]]) << size; + size += ulvc_cwd_suf_len[u_q[i + 1]]; + + vlc_encode(vlcp, val, size); + } +} + +using fn_proc_vlc_encode = void (*)(vlc_struct *, ui32 *, ui32 *, ui32); + +void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs, + ui32 num_passes, ui32 _width, ui32 height, + ui32 stride, ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) +{ + ojph_unused(num_passes); //currently not 
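// --------------------------------------------------------------------------
// Scalar reference (illustrative, not part of the patch) for the u_q
// signalling that proc_mel_encode1 and proc_vlc_encode1 above implement
// jointly on the initial quad row; it mirrors the 64-bit encoder earlier in
// this patch (the AVX2 tables carry no extension part):
static void encode_u_pair_initial_sketch(mel_struct* mel, vlc_struct* vlc,
                                         ui32 u0, ui32 u1)
{
  if (u0 > 0 && u1 > 0)
    mel_encode(mel, ojph_min(u0, u1) > 2);  // "both exceed 2" escape bit
  if (u0 > 2 && u1 > 2) {                   // both coded with offset 2
    vlc_encode(vlc, ulvc_cwd_pre[u0 - 2], ulvc_cwd_pre_len[u0 - 2]);
    vlc_encode(vlc, ulvc_cwd_pre[u1 - 2], ulvc_cwd_pre_len[u1 - 2]);
    vlc_encode(vlc, ulvc_cwd_suf[u0 - 2], ulvc_cwd_suf_len[u0 - 2]);
    vlc_encode(vlc, ulvc_cwd_suf[u1 - 2], ulvc_cwd_suf_len[u1 - 2]);
  } else if (u0 > 2 && u1 > 0) {            // u1 is 1 or 2: one raw bit
    vlc_encode(vlc, ulvc_cwd_pre[u0], ulvc_cwd_pre_len[u0]);
    vlc_encode(vlc, u1 - 1, 1);
    vlc_encode(vlc, ulvc_cwd_suf[u0], ulvc_cwd_suf_len[u0]);
  } else {                                  // both coded as-is
    vlc_encode(vlc, ulvc_cwd_pre[u0], ulvc_cwd_pre_len[u0]);
    vlc_encode(vlc, ulvc_cwd_pre[u1], ulvc_cwd_pre_len[u1]);
    vlc_encode(vlc, ulvc_cwd_suf[u0], ulvc_cwd_suf_len[u0]);
    vlc_encode(vlc, ulvc_cwd_suf[u1], ulvc_cwd_suf_len[u1]);
  }
}
// --------------------------------------------------------------------------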
used + + ui32 width = (_width + 15) & ~15u; + ui32 ignore = width - _width; + const int ms_size = (16384 * 16 + 14) / 15; //more than enough + const int mel_vlc_size = 3072; //more than enough + const int mel_size = 192; + const int vlc_size = mel_vlc_size - mel_size; + + ui8 ms_buf[ms_size]; + ui8 mel_vlc_buf[mel_vlc_size]; + ui8 *mel_buf = mel_vlc_buf; + ui8 *vlc_buf = mel_vlc_buf + mel_size; + + mel_struct mel; + mel_init(&mel, mel_size, mel_buf); + vlc_struct vlc; + vlc_init(&vlc, vlc_size, vlc_buf); + ms_struct ms; + ms_init(&ms, ms_size, ms_buf); + + const ui32 p = 30 - missing_msbs; + + //e_val: E values for a line (these are the highest set bit) + //cx_val: is the context values + //Each byte stores the info for the 2 sample. For E, it is maximum + // of the two samples, while for cx, it is the OR of these two samples. + //The maximum is between the pixel at the bottom left of one quad + // and the bottom right of the earlier quad. The same is true for cx. + //For a 1024 pixels, we need 512 bytes, the 2 extra, + // one for the non-existing earlier quad, and one for beyond the + // the end + const __m256i right_shift = _mm256_set_epi32( + 0, 7, 6, 5, 4, 3, 2, 1 + ); + + const __m256i left_shift = _mm256_set_epi32( + 6, 5, 4, 3, 2, 1, 0, 7 + ); + + ui32 n_loop = (width + 15) / 16; + + __m256i e_val_vec[65]; + for (ui32 i = 0; i > 3); */ + __m256i tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8)); + cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3); + + prev_e_val_vec = ZERO; + prev_cx_val_vec = ZERO; + + ui32 *sp = buf + y * stride; + + /* 16 bytes per iteration */ + for (ui32 x = 0; x < n_loop; ++x) { + + /* t = sp[i]; */ + if ((x == (n_loop - 1)) && (_width % 16)) { + ui32 tmp_buf[16] = { 0 }; + memcpy(tmp_buf, sp, (_width % 16) * sizeof(ui32)); + src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf)); + src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8)); + if (y + 1 < height) { + memcpy(tmp_buf, sp + stride, (_width % 16) * sizeof(ui32)); + src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf)); + src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8)); + } + else { + src_vec[1] = ZERO; + src_vec[3] = ZERO; + } + } + else { + src_vec[0] = _mm256_loadu_si256((__m256i*)(sp)); + src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8)); + + if (y + 1 < height) { + src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride)); + src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride)); + } + else { + src_vec[1] = ZERO; + src_vec[3] = ZERO; + } + sp += 16; + } + + /* src_vec layout: + * src_vec[0]:[0, 0],[0, 1],[0, 2],[0, 3],[0, 4],[0, 5],.[0, 6],.[0, 7] + * src_vec[1]:[1, 0],[1, 1],[1, 2],[1, 3],[1, 4],[1, 5],.[1, 6],.[1, 7] + * src_vec[2]:[0, 8],[0, 9],[0,10],[0,11],[0,12],[0,13],.[0,14], [0,15] + * src_vec[3]:[1, 8],[1, 9],[1,10],[1,11],[1,12],[1,13],.[1,14], [1,15] + */ + __m256i rho_vec, e_qmax_vec; + proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec); + + // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1; + tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift); + tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7); + + auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]); + max_e_vec = _mm256_sub_epi32(max_e_vec, ONE); + + // kappa[i] = (rho[i] & (rho[i] - 1)) ? 
ojph_max(1, max_e[i]) : 1; + tmp = _mm256_max_epi32(max_e_vec, ONE); + __m256i tmp1 = _mm256_sub_epi32(rho_vec, ONE); + tmp1 = _mm256_and_si256(rho_vec, tmp1); + + auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO); + auto kappa_vec1_ = _mm256_and_si256(cmp, ONE); + auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp); + const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_); + + /* cq[1 - 16] = cq_vec + * cq[0] = prev_cq_vec[0] + */ + tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift); + + auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift); + cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0); + prev_cq = (ui32)_mm256_extract_epi32(tmp, 7); + + update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift); + update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift); + + /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */ + /* u_q[i] = Uq[i] - kappa[i]; */ + auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec); + auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec); + + auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec); + __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl); + ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0; + + proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore, + right_shift); + + proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec); + + // vlc_encode(&vlc, tuple[i*2+0] >> 8, (tuple[i*2+0] >> 4) & 7); + // vlc_encode(&vlc, tuple[i*2+1] >> 8, (tuple[i*2+1] >> 4) & 7); + ui32 u_q[8]; + ui32 tuple[8]; + /* The tuple is scaled by 4 due to: + * vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7, true); + * So in the vlc_encode, the tuple will only be scaled by 2. + */ + tuple_vec = _mm256_srli_epi32(tuple_vec, 4); + _mm256_storeu_si256((__m256i*)tuple, tuple_vec); + _mm256_storeu_si256((__m256i*)u_q, u_q_vec); + + proc_vlc_encode(&vlc, tuple, u_q, _ignore); + } + + tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift); + tmp = _mm256_slli_epi32(tmp, 2); + tmp = _mm256_add_epi32(tmp, cx_val_vec[0]); + prev_cq = (ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp)); + + proc_cq = proc_cq2; + vlc_tbl = vlc_tbl1; + proc_mel_encode = proc_mel_encode2; + proc_vlc_encode = proc_vlc_encode2; + } + + ms_terminate(&ms); + terminate_mel_vlc(&mel, &vlc); + + //copy to elastic + lengths[0] = mel.pos + vlc.pos + ms.pos; + elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded); + memcpy(coded->buf, ms.buf, ms.pos); + memcpy(coded->buf + ms.pos, mel.buf, mel.pos); + memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos); + + // put in the interface locator word + ui32 num_bytes = mel.pos + vlc.pos; + coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4); + coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0; + coded->buf[lengths[0]-2] = + (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF)); + + coded->avail_size -= lengths[0]; +} + +} /* namespace local */ +} /* namespace ojph */ diff --git a/src/core/coding/ojph_block_encoder_avx512.cpp b/src/core/coding/ojph_block_encoder_avx512.cpp index 5912b09..b35373a 100644 --- a/src/core/coding/ojph_block_encoder_avx512.cpp +++ b/src/core/coding/ojph_block_encoder_avx512.cpp @@ -64,8 +64,8 @@ namespace ojph { // index is (c_q << 8) + (rho << 4) + eps // data is (cwd << 8) + (cwd_len << 4) + eps // table 0 is for the initial line of quads - static ui32 vlc_tbl0[2048] = { 0 }; - static ui32 vlc_tbl1[2048] = { 0 }; + static ui32 vlc_tbl0[2048]; + static ui32 vlc_tbl1[2048]; //UVLC encoding static ui32 ulvc_cwd_pre[33]; @@ -218,18 +218,18 @@ 
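// --------------------------------------------------------------------------
// This hunk replaces a load-time static initializer with an explicit,
// lazily invoked initialize_block_encoder_tables_avx512(); the new AVX2
// file above follows the same pattern. The idiom in isolation (sketch;
// build_tables_sketch() is a hypothetical stand-in for the real builders):
static bool tables_ready_sketch = false;
static bool build_tables_sketch() { return true; } // stand-in
static bool initialize_tables_once_sketch()
{
  if (!tables_ready_sketch)
    tables_ready_sketch = build_tables_sketch();
  return tables_ready_sketch;
}
// --------------------------------------------------------------------------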
namespace ojph { } ///////////////////////////////////////////////////////////////////////// - bool initialize_tables() { - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { - bool result; - result = vlc_init_tables(); - result = result && uvlc_init_tables(); - return result; - } - return false; - } + static bool tables_initialized = false; ///////////////////////////////////////////////////////////////////////// - static bool tables_initialized = initialize_tables(); + bool initialize_block_encoder_tables_avx512() { + if (!tables_initialized) { + memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); + memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); + tables_initialized = vlc_init_tables(); + tables_initialized = tables_initialized && uvlc_init_tables(); + } + return tables_initialized; + } ///////////////////////////////////////////////////////////////////////// // @@ -377,6 +377,13 @@ namespace ojph { if (melp->run > 0) mel_emit_bit(melp, 1); + if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) { + *(vlcp->buf - vlcp->pos) = 0x7f; + vlcp->pos++; + vlcp->tmp >>= 7; + vlcp->used_bits -= 7; + } + melp->tmp = melp->tmp << melp->remaining_bits; int mel_mask = (0xFF << melp->remaining_bits) & 0xFF; int vlc_mask = 0xFF >> (8 - vlcp->used_bits); diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h index 947f25b..29ab7a5 100644 --- a/src/core/common/ojph_arch.h +++ b/src/core/common/ojph_arch.h @@ -166,6 +166,32 @@ namespace ojph { #endif } + ///////////////////////////////////////////////////////////////////////////// + static inline ui32 population_count64(ui64 val) + { + #if defined(OJPH_COMPILER_MSVC) \ + && (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) + return (ui32)__popcnt64(val); + #elif (defined OJPH_COMPILER_GNUC) + return (ui32)__builtin_popcountll(val); + #else + const ui64 k1 = 0x5555555555555555ull; + const ui64 k2 = 0x3333333333333333ull; + const ui64 k4 = 0x0F0F0F0F0F0F0F0Full; + const ui64 kf = 0x0101010101010101ull; + + // put count of each 2 bits into those 2 bits + val = val - ((val >> 1) & k1); + // put count of each 4 bits into those 4 bits + val = (val & k2) + ((val >> 2) & k2); + // put count of each 8 bits into those 8 bits + val = (val + (val >> 4)) & k4 ; + // returns 8 most significant bits of x + (x<<8) + (x<<16) + (x<<24) + ... 
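// --------------------------------------------------------------------------
// The portable branch above is a standard SWAR reduction: pair sums, then
// nibble sums, then byte sums, then one multiply gathers the byte sums into
// the top byte. A quick check of values it must produce (illustrative):
static bool popcount64_selfcheck_sketch()
{
  return population_count64(0ull) == 0
      && population_count64(1ull) == 1
      && population_count64(0x8000000000000000ull) == 1
      && population_count64(0x5555555555555555ull) == 32
      && population_count64(0xFFFFFFFFFFFFFFFFull) == 64;
}
// --------------------------------------------------------------------------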
+ val = (val * kf) >> 56; + return (ui32) val; + #endif + } + ///////////////////////////////////////////////////////////////////////////// #ifdef OJPH_COMPILER_MSVC #pragma intrinsic(_BitScanReverse) @@ -188,6 +214,29 @@ namespace ojph { #endif } + ///////////////////////////////////////////////////////////////////////////// +#ifdef OJPH_COMPILER_MSVC + #pragma intrinsic(_BitScanReverse64) +#endif + static inline ui32 count_leading_zeros(ui64 val) + { + #ifdef OJPH_COMPILER_MSVC + unsigned long result = 0; + _BitScanReverse64(&result, val); + return 63 ^ (ui32)result; + #elif (defined OJPH_COMPILER_GNUC) + return (ui32)__builtin_clzll(val); + #else + val |= (val >> 1); + val |= (val >> 2); + val |= (val >> 4); + val |= (val >> 8); + val |= (val >> 16); + val |= (val >> 32); + return 64 - population_count64(val); + #endif + } + ///////////////////////////////////////////////////////////////////////////// #ifdef OJPH_COMPILER_MSVC #pragma intrinsic(_BitScanForward) @@ -237,9 +286,15 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // constants //////////////////////////////////////////////////////////////////////////// - const ui32 byte_alignment = 64; // 64 bytes == 512 bits - const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); - const ui32 object_alignment = 8; + #ifndef OJPH_EMSCRIPTEN + const ui32 byte_alignment = 64; // 64 bytes == 512 bits + const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); + const ui32 object_alignment = 8; + #else + const ui32 byte_alignment = 16; // 16 bytes == 128 bits + const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); + const ui32 object_alignment = 8; + #endif //////////////////////////////////////////////////////////////////////////// // templates for alignment @@ -247,17 +302,17 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // finds the size such that it is a multiple of byte_alignment - template + template size_t calc_aligned_size(size_t size) { size = size * sizeof(T) + N - 1; size &= ~((1ULL << (31 - count_leading_zeros(N))) - 1); - size >>= (31 - count_leading_zeros(sizeof(T))); + size >>= (63 - count_leading_zeros((ui64)sizeof(T))); return size; } //////////////////////////////////////////////////////////////////////////// // moves the pointer to first address that is a multiple of byte_alignment - template + template inline T *align_ptr(T *ptr) { intptr_t p = reinterpret_cast(ptr); p += N - 1; diff --git a/src/core/common/ojph_codestream.h b/src/core/common/ojph_codestream.h index 5f6dcdb..f7a8065 100644 --- a/src/core/common/ojph_codestream.h +++ b/src/core/common/ojph_codestream.h @@ -57,10 +57,11 @@ namespace ojph { class param_siz; class param_cod; class param_qcd; + class param_nlt; class comment_exchange; class mem_fixed_allocator; struct point; - struct line_buf; + class line_buf; class outfile_base; class infile_base; @@ -318,7 +319,7 @@ namespace ojph { * @brief Returns the underlying SIZ marker segment object * * @return param_siz This object holds SIZ marker segment information, - * which are related to codestream dimensions, number + * which deals with codestream dimensions, number * of components, bit depth, ... etc. 
*/ param_siz access_siz(); @@ -327,7 +328,7 @@ namespace ojph { * @brief Returns the underlying COD marker segment object * * @return param_cod This object holds COD marker segment information, - * which are related to coding parameters, such as + * which deals with coding parameters, such as * codeblock sizes, progression order, reversible, * ... etc. */ @@ -337,11 +338,20 @@ namespace ojph { * @brief Returns the underlying QCD marker segment object * * @return param_qcd This object holds QCD marker segment information, - * which are related to quantization parameters -- + * which deals with quantization parameters -- * quantization step size for each subband. */ param_qcd access_qcd(); + /** + * @brief Returns the underlying NLT marker segment object + * + * @return param_nlt This object holds NLT marker segment information, + * which deals with non-linearity point transformation + * for each component. + */ + param_nlt access_nlt(); + /** * @brief Query if the codestream extraction is planar or not. * See the documentation for ojph::codestream::set_planar() diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h index d7497cd..99897f3 100644 --- a/src/core/common/ojph_mem.h +++ b/src/core/common/ojph_mem.h @@ -132,9 +132,23 @@ namespace ojph { }; ///////////////////////////////////////////////////////////////////////////// - struct line_buf + class line_buf { - line_buf() : size(0), pre_size(0), i32(0) {} + public: + enum : ui32 { + LFT_UNDEFINED = 0x00, // Type is undefined/uninitialized + // These flags reflects data size in bytes + LFT_BYTE = 0x01, // Set when data is 1 byte + LFT_16BIT = 0x02, // Set when data is 2 bytes + LFT_32BIT = 0x04, // Set when data is 4 bytes + LFT_64BIT = 0x08, // Set when data is 8 bytes + LFT_REVERSIBLE = 0x10, // Set when data is used for reversible coding + // Not all combinations are useful + LFT_SIZE_MASK = 0x0F, // To extract data size + }; + + public: + line_buf() : size(0), pre_size(0), flags(LFT_UNDEFINED), i32(0) {} template void pre_alloc(mem_fixed_allocator *p, size_t num_ele, ui32 pre_size) @@ -153,9 +167,12 @@ namespace ojph { size_t size; ui32 pre_size; + ui32 flags; union { - si32* i32; - float* f32; + si32* i32; // 32bit integer type, used for lossless compression + si64* i64; // 64bit integer type, used for lossless compression + float* f32; // float type, used for lossy compression + void* p; // no type is associated with the pointer }; }; diff --git a/src/core/common/ojph_params.h b/src/core/common/ojph_params.h index 0dce0ce..602fd99 100644 --- a/src/core/common/ojph_params.h +++ b/src/core/common/ojph_params.h @@ -52,6 +52,7 @@ namespace ojph { struct param_qcd; struct param_qcc; struct param_cap; + struct param_nlt; class codestream; } @@ -131,6 +132,47 @@ namespace ojph { local::param_qcd* state; }; + /** + * @brief non-linearity point transformation object + * (implements NLT marker segment) + * + */ + class OJPH_EXPORT param_nlt + { + public: + enum special_comp_num : ui16 { ALL_COMPS = 65535 }; + public: + param_nlt(local::param_nlt* p) : state(p) {} + + /** + * @brief enables or disables type 3 nonlinearity for a component + * or the default setting + * + * If you think that you need type 3 nonlinearity for all components, + * call this function with comp_num set to 65535 and enable to true. + * + * @param comp_num: component number, or 65535 for the default setting + * @param enable: true to enable nlt type 3 for this component or the + default setting, false to disable nlt type 3. 
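// --------------------------------------------------------------------------
// Sketch of how the new line_buf flags above are meant to be read (the
// LFT_SIZE_MASK bits are mutually exclusive; LFT_REVERSIBLE rides on top);
// the helper name is illustrative:
static bool is_reversible_64bit_sketch(const line_buf& line)
{
  return (line.flags & line_buf::LFT_SIZE_MASK) == line_buf::LFT_64BIT
      && (line.flags & line_buf::LFT_REVERSIBLE) != 0;
}
// a reversible 64-bit line is then read through line.i64, a lossy 32-bit
// line through line.f32, matching the union members documented above.
// --------------------------------------------------------------------------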
+ */ + void set_type3_transformation(ui32 comp_num, bool enable); + + /** + * @brief get the state (enabled or disabled) of type 3 nonlinearity + * for a component or the default setting + * + * @param comp_num: component number, or 65535 for the default setting + * @param bit_depth: returns the bit depth of the component/default + * @param is_signed: returns true if the component/default is signed + * @return true if enabled or false if not. + */ + bool get_type3_transformation(ui32 comp_num, ui8& bit_depth, + bool& is_signed); + + private: + local::param_nlt* state; + }; + //////////////////////////////////////////////////////////////////////////// class OJPH_EXPORT comment_exchange { diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h index 593d4b7..00faf75 100644 --- a/src/core/common/ojph_version.h +++ b/src/core/common/ojph_version.h @@ -34,5 +34,5 @@ //***************************************************************************/ #define OPENJPH_VERSION_MAJOR 0 -#define OPENJPH_VERSION_MINOR 15 +#define OPENJPH_VERSION_MINOR 18 #define OPENJPH_VERSION_PATCH 0 diff --git a/src/core/others/ojph_mem.cpp b/src/core/others/ojph_mem.cpp index b70d51e..0bb0b5f 100644 --- a/src/core/others/ojph_mem.cpp +++ b/src/core/others/ojph_mem.cpp @@ -65,22 +65,42 @@ namespace ojph { f32 = p->post_alloc_data(size, pre_size); } + //////////////////////////////////////////////////////////////////////////// + template<> + void line_buf::finalize_alloc(mem_fixed_allocator *p) + { + assert(p != 0 && size != 0); + i64 = p->post_alloc_data(size, pre_size); + } + //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(si32 *buffer, size_t num_ele, ui32 pre_size) { - i32 = buffer; + this->i32 = buffer; this->size = num_ele; this->pre_size = pre_size; + this->flags = LFT_32BIT | LFT_REVERSIBLE; } //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(float *buffer, size_t num_ele, ui32 pre_size) { - f32 = buffer; + this->f32 = buffer; + this->size = num_ele; + this->pre_size = pre_size; + this->flags = LFT_32BIT; + } + + //////////////////////////////////////////////////////////////////////////// + template<> + void line_buf::wrap(si64 *buffer, size_t num_ele, ui32 pre_size) + { + this->i64 = buffer; this->size = num_ele; this->pre_size = pre_size; + this->flags = LFT_64BIT | LFT_REVERSIBLE; } //////////////////////////////////////////////////////////////////////////// diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index fb42a7d..a98b477 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -39,53 +39,66 @@ #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include "ojph_colour_local.h" namespace ojph { + + // defined elsewhere + class line_buf; + namespace local { ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_si32_to_si32_shftd) - (const si32 *sp, si32 *dp, int shift, ui32 width) = NULL; + void (*rev_convert) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) = NULL; + + ////////////////////////////////////////////////////////////////////////// + void (*rev_convert_nlt_type3) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) = NULL; - 
//////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float_shftd) (const si32 *sp, float *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float) (const si32 *sp, float *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32_shftd) (const float *sp, si32 *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32) (const float *sp, si32 *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*rct_forward) - (const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) = NULL; + (const line_buf* r, const line_buf* g, const line_buf* b, + line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*rct_backward) - (const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) = NULL; + (const line_buf* r, const line_buf* g, const line_buf* b, + line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*ict_forward) (const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*ict_backward) (const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// static bool colour_transform_functions_initialized = false; ////////////////////////////////////////////////////////////////////////// @@ -96,7 +109,8 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) - cnvrt_si32_to_si32_shftd = gen_cnvrt_si32_to_si32_shftd; + rev_convert = gen_rev_convert; + rev_convert_nlt_type3 = gen_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = gen_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = gen_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = gen_cnvrt_float_to_si32_shftd; @@ -125,9 +139,10 @@ namespace ojph { #ifndef OJPH_DISABLE_SSE2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) { + rev_convert = sse2_rev_convert; + rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3; cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd; cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32; - cnvrt_si32_to_si32_shftd = sse2_cnvrt_si32_to_si32_shftd; rct_forward = sse2_rct_forward; rct_backward = sse2_rct_backward; } @@ -148,7 +163,8 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX2 if 
(get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - cnvrt_si32_to_si32_shftd = avx2_cnvrt_si32_to_si32_shftd; + rev_convert = avx2_rev_convert; + rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3; rct_forward = avx2_rct_forward; rct_backward = avx2_rct_backward; } @@ -161,7 +177,9 @@ namespace ojph { #endif // !OJPH_DISABLE_SIMD #else // OJPH_ENABLE_WASM_SIMD - cnvrt_si32_to_si32_shftd = wasm_cnvrt_si32_to_si32_shftd; + + rev_convert = wasm_rev_convert; + rev_convert_nlt_type3 = wasm_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = wasm_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = wasm_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd; @@ -170,6 +188,7 @@ namespace ojph { rct_backward = wasm_rct_backward; ict_forward = wasm_ict_forward; ict_backward = wasm_ict_backward; + #endif // !OJPH_ENABLE_WASM_SIMD colour_transform_functions_initialized = true; @@ -193,11 +212,79 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void gen_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) { - for (ui32 i = width; i > 0; --i) - *dp++ = *sp++ + shift; + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + si32 s = (si32)shift; + for (ui32 i = width; i > 0; --i) + *dp++ = *sp++ + s; + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + for (ui32 i = width; i > 0; --i) + *dp++ = *sp++ + shift; + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + for (ui32 i = width; i > 0; --i) + *dp++ = (si32)(*sp++ + shift); + } + } + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + si32 s = (si32)shift; + for (ui32 i = width; i > 0; --i) { + const si32 v = *sp++; + *dp++ = v >= 0 ? v : (- v - s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + for (ui32 i = width; i > 0; --i) { + const si64 v = *sp++; + *dp++ = v >= 0 ? v : (- v - shift); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + for (ui32 i = width; i > 0; --i) { + const si64 v = *sp++; + *dp++ = (si32)(v >= 0 ? 
v : (- v - shift)); + } + } } ////////////////////////////////////////////////////////////////////////// @@ -233,26 +320,104 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void gen_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { - for (ui32 i = repeat; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - *y++ = (*r + (*g << 1) + *b) >> 2; - *cb++ = (*b++ - *g); - *cr++ = (*r++ - *g++); + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (ui32 i = repeat; i > 0; --i) + { + si32 rr = *rp++, gg = *gp++, bb = *bp++; + *yp++ = (rr + (gg << 1) + bb) >> 2; + *cbp++ = (bb - gg); + *crp++ = (rr - gg); + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (ui32 i = repeat; i > 0; --i) + { + si64 rr = *rp++, gg = *gp++, bb = *bp++; + *yp++ = (rr + (gg << 1) + bb) >> 2; + *cbp++ = (bb - gg); + *crp++ = (rr - gg); + } } } ////////////////////////////////////////////////////////////////////////// - void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void gen_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { - for (ui32 i = repeat; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - *g = *y++ - ((*cb + *cr)>>2); - *b++ = *cb++ + *g; - *r++ = *cr++ + *g++; + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (ui32 i = repeat; i > 0; --i) + { + si32 yy = *yp++, cbb = *cbp++, crr = *crp++; + si32 gg = yy - ((cbb + crr) >> 2); + *rp++ = crr + gg; + *gp++ = gg; + *bp++ = cbb + gg; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = 
r->i32, *gp = g->i32, *bp = b->i32; + for (ui32 i = repeat; i > 0; --i) + { + si64 yy = *yp++, cbb = *cbp++, crr = *crp++; + si64 gg = yy - ((cbb + crr) >> 2); + *rp++ = (si32)(crr + gg); + *gp++ = (si32)gg; + *bp++ = (si32)(cbb + gg); + } } } diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h index 212848b..cc42aaa 100644 --- a/src/core/transform/ojph_colour.h +++ b/src/core/transform/ojph_colour.h @@ -40,14 +40,26 @@ #define OJPH_COLOR_H namespace ojph { + + // defined elsewhere + class line_buf; + namespace local { //////////////////////////////////////////////////////////////////////////// void init_colour_transform_functions(); //////////////////////////////////////////////////////////////////////////// - extern void (*cnvrt_si32_to_si32_shftd) - (const si32 *sp, si32 *dp, int shift, ui32 width); + extern void (*rev_convert) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); + + //////////////////////////////////////////////////////////////////////////// + extern void (*rev_convert_nlt_type3) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_si32_to_float_shftd) @@ -67,13 +79,13 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// extern void (*rct_forward) - (const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + (const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); //////////////////////////////////////////////////////////////////////////// extern void (*rct_backward) - (const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + (const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); //////////////////////////////////////////////////////////////////////////// extern void (*ict_forward) diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 60e20d6..05bff31 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -35,10 +35,12 @@ // Date: 11 October 2019 //***************************************************************************/ +#include #include #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include @@ -46,61 +48,392 @@ namespace ojph { namespace local { + ///////////////////////////////////////////////////////////////////////// + // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h + static inline + __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m) + { + // note than m must be obtained using + // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt)); + __m256i x = _mm256_srli_epi64(a, amt); + x = _mm256_xor_si256(x, m); + __m256i result = _mm256_sub_epi64(x, m); + return result; + } + ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void avx2_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) { - __m256i sh = _mm256_set1_epi32(shift); - for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + 
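avx2_mm256_srai_epi64 fills a real gap: AVX2 has no 64-bit arithmetic right shift, so the helper shifts logically and then uses m = 1 << (63 - amt) to flip and subtract the relocated sign bit. A scalar model of the same identity (stand-alone sketch; srai64_model and the test values are illustrative, not part of the patch):

  #include <cassert>
  #include <cstdint>

  // Logical shift + xor/subtract of m sign-extends exactly like an
  // arithmetic shift: the borrow from (x ^ m) - m fills the top bits.
  static int64_t srai64_model(int64_t a, int amt)
  {
    const uint64_t m = 1ull << (63 - amt); // where the sign bit lands
    uint64_t x = (uint64_t)a >> amt;       // logical shift (srli)
    x ^= m;                                // flip the relocated sign bit
    return (int64_t)(x - m);               // sign-extend via the borrow
  }

  int main()
  {
    const int64_t vals[] = { 100, -100, -1 };
    for (int64_t v : vals)                 // ">> 2" is arithmetic here
      assert(srai64_model(v, 2) == (v >> 2));
    return 0;
  }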
if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi32((si32)shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + s = _mm256_add_epi32(s, sh); + _mm256_storeu_si256((__m256i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s, t; + s = _mm256_loadu_si256((__m256i*)sp); + + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0)); + t = _mm256_add_epi64(t, sh); + _mm256_storeu_si256((__m256i*)dp, t); + + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1)); + t = _mm256_add_epi64(t, sh); + _mm256_storeu_si256((__m256i*)dp + 1, t); + } + } + } + else { - __m256i s = _mm256_loadu_si256((__m256i*)sp); - s = _mm256_add_epi32(s, sh); - _mm256_storeu_si256((__m256i*)dp, s); + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + __m256i sh = _mm256_set1_epi64x(shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s, t; + s = _mm256_loadu_si256((__m256i*)sp); + s = _mm256_add_epi64(s, sh); + + t = _mm256_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm256_and_si256(low_bits, t); + + s = _mm256_loadu_si256((__m256i*)sp + 1); + s = _mm256_add_epi64(s, sh); + + s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); + s = _mm256_andnot_si256(low_bits, s); + + t = _mm256_or_si256(s, t); + t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i*)dp, t); + } } } ////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void avx2_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) { - for (int i = (repeat + 7) >> 3; i > 0; --i) + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi32((si32)(-shift)); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + __m256i c = _mm256_cmpgt_epi32(zero, s); // 0xFFFFFFFF for -ve val + __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only -shift-val + s = _mm256_andnot_si256(c, s); // keep only +ve or 0 + s = _mm256_or_si256(s, v_m_sh); // combine + _mm256_storeu_si256((__m256i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(-shift); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s, t, u0, u1, c, v_m_sh; + s = _mm256_loadu_si256((__m256i*)sp); + + t = _mm256_cmpgt_epi32(zero, s); // find -ve 32bit -1 + u0 = 
_mm256_unpacklo_epi32(s, t); // correct 64bit data + c = _mm256_unpacklo_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm256_sub_epi64(sh, u0); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value + u0 = _mm256_andnot_si256(c, u0); // keep only +ve or 0 + u0 = _mm256_or_si256(u0, v_m_sh); // combine + + u1 = _mm256_unpackhi_epi32(s, t); // correct 64bit data + c = _mm256_unpackhi_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm256_sub_epi64(sh, u1); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value + u1 = _mm256_andnot_si256(c, u1); // keep only +ve or 0 + u1 = _mm256_or_si256(u1, v_m_sh); // combine + + t = _mm256_permute2x128_si256(u0, u1, (2 << 4) | 0); + _mm256_storeu_si256((__m256i*)dp, t); + + t = _mm256_permute2x128_si256(u0, u1, (3 << 4) | 1); + _mm256_storeu_si256((__m256i*)dp + 1, t); + } + } + } + else { - __m256i mr = _mm256_load_si256((__m256i*)r); - __m256i mg = _mm256_load_si256((__m256i*)g); - __m256i mb = _mm256_load_si256((__m256i*)b); - __m256i t = _mm256_add_epi32(mr, mb); - t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); - _mm256_store_si256((__m256i*)y, _mm256_srai_epi32(t, 2)); - t = _mm256_sub_epi32(mb, mg); - _mm256_store_si256((__m256i*)cb, t); - t = _mm256_sub_epi32(mr, mg); - _mm256_store_si256((__m256i*)cr, t); - - r += 8; g += 8; b += 8; - y += 8; cb += 8; cr += 8; + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(-shift); + __m256i zero = _mm256_setzero_si256(); + __m256i half_mask = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + __m256i s, t, p, n, m, tm; + s = _mm256_loadu_si256((__m256i*)sp); + + m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value + tm = _mm256_sub_epi64(sh, s); // - shift - value + n = _mm256_and_si256(m, tm); // -ve + p = _mm256_andnot_si256(m, s); // +ve + tm = _mm256_or_si256(n, p); + tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm256_and_si256(half_mask, tm); + + s = _mm256_loadu_si256((__m256i*)sp + 1); + m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value + tm = _mm256_sub_epi64(sh, s); // - shift - value + n = _mm256_and_si256(m, tm); // -ve + p = _mm256_andnot_si256(m, s); // +ve + tm = _mm256_or_si256(n, p); + tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0)); + tm = _mm256_andnot_si256(half_mask, tm); + + t = _mm256_or_si256(t, tm); + t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i*)dp, t); + } } } ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void avx2_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) + { + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) + { + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) 
&& + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i mr = _mm256_load_si256((__m256i*)rp); + __m256i mg = _mm256_load_si256((__m256i*)gp); + __m256i mb = _mm256_load_si256((__m256i*)bp); + __m256i t = _mm256_add_epi32(mr, mb); + t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); + _mm256_store_si256((__m256i*)yp, _mm256_srai_epi32(t, 2)); + t = _mm256_sub_epi32(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi32(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + rp += 8; gp += 8; bp += 8; + yp += 8; cbp += 8; crp += 8; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i mr32 = _mm256_load_si256((__m256i*)rp); + __m256i mg32 = _mm256_load_si256((__m256i*)gp); + __m256i mb32 = _mm256_load_si256((__m256i*)bp); + __m256i mr, mg, mb, t; + mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0)); + mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0)); + mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0)); + + t = _mm256_add_epi64(mr, mb); + t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); + _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); + t = _mm256_sub_epi64(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi64(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + yp += 4; cbp += 4; crp += 4; + + mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1)); + mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1)); + mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1)); + + t = _mm256_add_epi64(mr, mb); + t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); + _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); + t = _mm256_sub_epi64(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi64(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + rp += 8; gp += 8; bp += 8; + yp += 4; cbp += 4; crp += 4; + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void avx2_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 7) >> 3; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) + { + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i my = 
_mm256_load_si256((__m256i*)yp); + __m256i mcb = _mm256_load_si256((__m256i*)cbp); + __m256i mcr = _mm256_load_si256((__m256i*)crp); + + __m256i t = _mm256_add_epi32(mcb, mcr); + t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); + _mm256_store_si256((__m256i*)gp, t); + __m256i u = _mm256_add_epi32(mcb, t); + _mm256_store_si256((__m256i*)bp, u); + u = _mm256_add_epi32(mcr, t); + _mm256_store_si256((__m256i*)rp, u); + + yp += 8; cbp += 8; crp += 8; + rp += 8; gp += 8; bp += 8; + } + } + else { - __m256i my = _mm256_load_si256((__m256i*)y); - __m256i mcb = _mm256_load_si256((__m256i*)cb); - __m256i mcr = _mm256_load_si256((__m256i*)cr); - - __m256i t = _mm256_add_epi32(mcb, mcr); - t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); - _mm256_store_si256((__m256i*)g, t); - __m256i u = _mm256_add_epi32(mcb, t); - _mm256_store_si256((__m256i*)b, u); - u = _mm256_add_epi32(mcr, t); - _mm256_store_si256((__m256i*)r, u); - - y += 8; cb += 8; cr += 8; - r += 8; g += 8; b += 8; + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i my, mcb, mcr, tr, tg, tb; + my = _mm256_load_si256((__m256i*)yp); + mcb = _mm256_load_si256((__m256i*)cbp); + mcr = _mm256_load_si256((__m256i*)crp); + + tg = _mm256_add_epi64(mcb, mcr); + tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2)); + tb = _mm256_add_epi64(mcb, tg); + tr = _mm256_add_epi64(mcr, tg); + + __m256i mr, mg, mb; + mr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0)); + mr = _mm256_and_si256(low_bits, mr); + mg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0)); + mg = _mm256_and_si256(low_bits, mg); + mb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0)); + mb = _mm256_and_si256(low_bits, mb); + + yp += 4; cbp += 4; crp += 4; + + my = _mm256_load_si256((__m256i*)yp); + mcb = _mm256_load_si256((__m256i*)cbp); + mcr = _mm256_load_si256((__m256i*)crp); + + tg = _mm256_add_epi64(mcb, mcr); + tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2)); + tb = _mm256_add_epi64(mcb, tg); + tr = _mm256_add_epi64(mcr, tg); + + tr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); + tr = _mm256_andnot_si256(low_bits, tr); + mr = _mm256_or_si256(mr, tr); + mr = _mm256_permute4x64_epi64(mr, _MM_SHUFFLE(3, 1, 2, 0)); + + tg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); + tg = _mm256_andnot_si256(low_bits, tg); + mg = _mm256_or_si256(mg, tg); + mg = _mm256_permute4x64_epi64(mg, _MM_SHUFFLE(3, 1, 2, 0)); + + tb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); + tb = _mm256_andnot_si256(low_bits, tb); + mb = _mm256_or_si256(mb, tb); + mb = _mm256_permute4x64_epi64(mb, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)rp, mr); + _mm256_store_si256((__m256i*)gp, mg); + _mm256_store_si256((__m256i*)bp, mb); + + yp += 4; cbp += 4; crp += 4; + rp += 8; gp += 8; bp += 8; + } } } diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index 6ddf890..5eb8b74 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -65,8 +65,16 @@ namespace ojph { 
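All of the rct_forward/rct_backward paths above implement one lifting identity, so a scalar check is enough to see why the transform round-trips exactly, and why only the operand width (si32 versus si64) changes between branches. (Stand-alone sketch; the sample values are arbitrary.)

  #include <cassert>
  #include <cstdint>

  int main()
  {
    int64_t r = 4000, g = -2500, b = 1000;   // arbitrary samples

    int64_t y  = (r + 2 * g + b) >> 2;       // forward RCT
    int64_t cb = b - g;
    int64_t cr = r - g;

    // Inverse: y = g + ((cb + cr) >> 2) holds exactly because
    // cb + cr = r + b - 2g and the 4g term shifts out with no loss.
    int64_t g2 = y - ((cb + cr) >> 2);
    int64_t r2 = cr + g2;
    int64_t b2 = cb + g2;

    assert(r2 == r && g2 == g && b2 == b);   // exact round trip
    return 0;
  }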
////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void gen_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, @@ -85,12 +93,14 @@ namespace ojph { ui32 width); ////////////////////////////////////////////////////////////////////////// - void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void gen_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void gen_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// void gen_ict_forward(const float *r, const float *g, const float *b, @@ -157,16 +167,26 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void sse2_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void sse2_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void sse2_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); + + ////////////////////////////////////////////////////////////////////////// + void sse2_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// // @@ -209,16 +229,26 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void avx2_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); 
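These declarations keep one signature per operation across the gen_/sse2_/avx2_/wasm_ families so that init_colour_transform_functions() can rebind a single function pointer to the widest ISA the CPU reports. A condensed, stand-alone model of that dispatch pattern (the names and level constants here are illustrative, not the library's actual values):

  #include <cstdint>
  #include <cstdio>

  typedef void (*convert_fn)(const int32_t* sp, int32_t* dp,
                             int64_t shift, uint32_t width);

  static void generic_impl(const int32_t* sp, int32_t* dp,
                           int64_t shift, uint32_t width)
  {
    for (uint32_t i = 0; i < width; ++i)  // portable scalar fallback
      dp[i] = (int32_t)(sp[i] + shift);
  }

  static convert_fn convert = nullptr;

  static void init_functions(int cpu_ext_level)
  {
    convert = generic_impl;     // always-correct default comes first
    // Stronger ISAs then overwrite the pointer, mirroring the
    // sse2_/avx2_ blocks in ojph_colour.cpp:
    //   if (cpu_ext_level >= LEVEL_SSE2) convert = sse2_impl;
    //   if (cpu_ext_level >= LEVEL_AVX2) convert = avx2_impl;
    (void)cpu_ext_level;
  }

  int main()
  {
    init_functions(0);
    int32_t s[4] = { 1, 2, 3, 4 }, d[4];
    convert(s, d, 128, 4);      // every call goes through the pointer
    std::printf("%d\n", d[0]);  // prints 129
    return 0;
  }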
////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void avx2_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void avx2_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); + + ////////////////////////////////////////////////////////////////////////// + void avx2_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// // @@ -245,16 +275,26 @@ namespace ojph { ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void wasm_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void wasm_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void wasm_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// void wasm_ict_forward(const float *r, const float *g, const float *b, diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 4a3cb14..a529c66 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -35,10 +35,12 @@ // Date: 11 October 2019 //***************************************************************************/ +#include #include #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include @@ -46,6 +48,207 @@ namespace ojph { namespace local { + ///////////////////////////////////////////////////////////////////////// + // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h + static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) + { + // note than m must be obtained using + // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt)); + __m128i x = _mm_srli_epi64(a, amt); + x = _mm_xor_si128(x, m); + __m128i result = _mm_sub_epi64(x, m); + return result; + } + + ////////////////////////////////////////////////////////////////////////// + static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero) + { + __m128i t; + t = 
_mm_cmplt_epi32(a, zero); // get -ve + t = _mm_unpacklo_epi32(a, t); + return t; + } + + ////////////////////////////////////////////////////////////////////////// + static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero) + { + __m128i t; + t = _mm_cmplt_epi32(a, zero); // get -ve + t = _mm_unpackhi_epi32(a, t); + return t; + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i sh = _mm_set1_epi32((si32)shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128i s = _mm_loadu_si128((__m128i*)sp); + s = _mm_add_epi32(s, sh); + _mm_storeu_si128((__m128i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m128i zero = _mm_setzero_si128(); + __m128i sh = _mm_set1_epi64x(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128i s, t; + s = _mm_loadu_si128((__m128i*)sp); + + t = sse2_cvtlo_epi32_epi64(s, zero); + t = _mm_add_epi64(t, sh); + _mm_storeu_si128((__m128i*)dp, t); + + t = sse2_cvthi_epi32_epi64(s, zero); + t = _mm_add_epi64(t, sh); + _mm_storeu_si128((__m128i*)dp + 1, t); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX); + __m128i sh = _mm_set1_epi64x(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128i s, t; + s = _mm_loadu_si128((__m128i*)sp); + s = _mm_add_epi64(s, sh); + + t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm_and_si128(low_bits, t); + + s = _mm_loadu_si128((__m128i*)sp + 1); + s = _mm_add_epi64(s, sh); + + s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); + s = _mm_andnot_si128(low_bits, s); + + t = _mm_or_si128(s, t); + _mm_storeu_si128((__m128i*)dp, t); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i sh = _mm_set1_epi32((si32)(-shift)); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i s = _mm_loadu_si128((__m128i*)sp); + __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value + __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + s = _mm_andnot_si128(c, s); // keep only +ve or 0 + s = _mm_or_si128(s, v_m_sh); // combine + _mm_storeu_si128((__m128i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m128i sh = _mm_set1_epi64x(-shift); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i 
s, t, u, c, v_m_sh; + s = _mm_loadu_si128((__m128i*)sp); + + t = _mm_cmplt_epi32(s, zero); // find -ve 32bit -1 + u = _mm_unpacklo_epi32(s, t); // correct 64bit data + c = _mm_unpacklo_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + u = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(u, v_m_sh); // combine + + _mm_storeu_si128((__m128i*)dp, u); + u = _mm_unpackhi_epi32(s, t); // correct 64bit data + c = _mm_unpackhi_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + u = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(u, v_m_sh); // combine + + _mm_storeu_si128((__m128i*)dp + 1, u); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i sh = _mm_set1_epi64x(-shift); + __m128i zero = _mm_setzero_si128(); + __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + __m128i s, t, p, n, m, tm; + s = _mm_loadu_si128((__m128i*)sp); + + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value + m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b + tm = _mm_sub_epi64(sh, s); // - shift - value + n = _mm_and_si128(m, tm); // -ve + p = _mm_andnot_si128(m, s); // +ve + tm = _mm_or_si128(n, p); + tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm_and_si128(half_mask, tm); + + s = _mm_loadu_si128((__m128i*)sp + 1); + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value + m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b + tm = _mm_sub_epi64(sh, s); // - shift - value + n = _mm_and_si128(m, tm); // -ve + p = _mm_andnot_si128(m, s); // +ve + tm = _mm_or_si128(n, p); + tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0)); + tm = _mm_andnot_si128(half_mask, tm); + + t = _mm_or_si128(t, tm); + _mm_storeu_si128((__m128i*)dp, t); + } + } + } + ////////////////////////////////////////////////////////////////////////// void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width) @@ -80,64 +283,200 @@ namespace ojph { _MM_SET_ROUNDING_MODE(rounding_mode); } - ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void sse2_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) { - __m128i sh = _mm_set1_epi32(shift); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m128i s = _mm_loadu_si128((__m128i*)sp); - s = _mm_add_epi32(s, sh); - _mm_storeu_si128((__m128i*)dp, s); - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & 
line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i mr = _mm_load_si128((__m128i*)rp); + __m128i mg = _mm_load_si128((__m128i*)gp); + __m128i mb = _mm_load_si128((__m128i*)bp); + __m128i t = _mm_add_epi32(mr, mb); + t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); + _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2)); + t = _mm_sub_epi32(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi32(mr, mg); + _mm_store_si128((__m128i*)crp, t); - ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) - { - for (int i = (repeat + 3) >> 2; i > 0; --i) + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else { - __m128i mr = _mm_load_si128((__m128i*)r); - __m128i mg = _mm_load_si128((__m128i*)g); - __m128i mb = _mm_load_si128((__m128i*)b); - __m128i t = _mm_add_epi32(mr, mb); - t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); - _mm_store_si128((__m128i*)y, _mm_srai_epi32(t, 2)); - t = _mm_sub_epi32(mb, mg); - _mm_store_si128((__m128i*)cb, t); - t = _mm_sub_epi32(mr, mg); - _mm_store_si128((__m128i*)cr, t); - - r += 4; g += 4; b += 4; - y += 4; cb += 4; cr += 4; + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m128i zero = _mm_setzero_si128(); + __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i mr32 = _mm_load_si128((__m128i*)rp); + __m128i mg32 = _mm_load_si128((__m128i*)gp); + __m128i mb32 = _mm_load_si128((__m128i*)bp); + __m128i mr, mg, mb, t; + mr = sse2_cvtlo_epi32_epi64(mr32, zero); + mg = sse2_cvtlo_epi32_epi64(mg32, zero); + mb = sse2_cvtlo_epi32_epi64(mb32, zero); + + t = _mm_add_epi64(mr, mb); + t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); + _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); + t = _mm_sub_epi64(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi64(mr, mg); + _mm_store_si128((__m128i*)crp, t); + + yp += 2; cbp += 2; crp += 2; + + mr = sse2_cvthi_epi32_epi64(mr32, zero); + mg = sse2_cvthi_epi32_epi64(mg32, zero); + mb = sse2_cvthi_epi32_epi64(mb32, zero); + + t = _mm_add_epi64(mr, mb); + t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); + _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); + t = _mm_sub_epi64(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi64(mr, mg); + _mm_store_si128((__m128i*)crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } } } ////////////////////////////////////////////////////////////////////////// - void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void sse2_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 3) >> 2; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & 
line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m128i my = _mm_load_si128((__m128i*)y); - __m128i mcb = _mm_load_si128((__m128i*)cb); - __m128i mcr = _mm_load_si128((__m128i*)cr); - - __m128i t = _mm_add_epi32(mcb, mcr); - t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); - _mm_store_si128((__m128i*)g, t); - __m128i u = _mm_add_epi32(mcb, t); - _mm_store_si128((__m128i*)b, u); - u = _mm_add_epi32(mcr, t); - _mm_store_si128((__m128i*)r, u); - - y += 4; cb += 4; cr += 4; - r += 4; g += 4; b += 4; + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i my = _mm_load_si128((__m128i*)yp); + __m128i mcb = _mm_load_si128((__m128i*)cbp); + __m128i mcr = _mm_load_si128((__m128i*)crp); + + __m128i t = _mm_add_epi32(mcb, mcr); + t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); + _mm_store_si128((__m128i*)gp, t); + __m128i u = _mm_add_epi32(mcb, t); + _mm_store_si128((__m128i*)bp, u); + u = _mm_add_epi32(mcr, t); + _mm_store_si128((__m128i*)rp, u); + + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } } - } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); + __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i my, mcb, mcr, tr, tg, tb; + my = _mm_load_si128((__m128i*)yp); + mcb = _mm_load_si128((__m128i*)cbp); + mcr = _mm_load_si128((__m128i*)crp); + + tg = _mm_add_epi64(mcb, mcr); + tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2)); + tb = _mm_add_epi64(mcb, tg); + tr = _mm_add_epi64(mcr, tg); + + __m128i mr, mg, mb; + mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0)); + mr = _mm_and_si128(low_bits, mr); + mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0)); + mg = _mm_and_si128(low_bits, mg); + mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0)); + mb = _mm_and_si128(low_bits, mb); + yp += 2; cbp += 2; crp += 2; + + my = _mm_load_si128((__m128i*)yp); + mcb = _mm_load_si128((__m128i*)cbp); + mcr = _mm_load_si128((__m128i*)crp); + + tg = _mm_add_epi64(mcb, mcr); + tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2)); + tb = _mm_add_epi64(mcb, tg); + tr = _mm_add_epi64(mcr, tg); + + tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); + tr = _mm_andnot_si128(low_bits, tr); + mr = _mm_or_si128(mr, tr); + tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); + tg = _mm_andnot_si128(low_bits, tg); + mg = _mm_or_si128(mg, tg); + tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); + tb = _mm_andnot_si128(low_bits, tb); + mb = _mm_or_si128(mb, tb); + + _mm_store_si128((__m128i*)rp, mr); + _mm_store_si128((__m128i*)gp, mg); + _mm_store_si128((__m128i*)bp, mb); + + yp += 2; cbp += 2; crp += 2; + rp += 4; gp += 4; bp += 4; + } + } + } } } diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 632a645..5bf6ccd 100644 --- 
a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -39,12 +39,164 @@ #include #include "ojph_defs.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include "ojph_colour_local.h" namespace ojph { namespace local { + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i32x4_splat((si32)shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s = wasm_v128_load(sp); + s = wasm_i32x4_add(s, sh); + wasm_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s, t; + s = wasm_v128_load(sp); + + t = wasm_i64x2_extend_low_i32x4(s); + t = wasm_i64x2_add(t, sh); + wasm_v128_store(dp, t); + + t = wasm_i64x2_extend_high_i32x4(s); + t = wasm_i64x2_add(t, sh); + wasm_v128_store(dp + 2, t); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s0, s1; + s0 = wasm_v128_load(sp); + s0 = wasm_i64x2_add(s0, sh); + s1 = wasm_v128_load(sp + 2); + s1 = wasm_i64x2_add(s1, sh); + s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2); + wasm_v128_store(dp, s0); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i32x4_splat((si32)(-shift)); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s = wasm_v128_load(sp); + v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value + v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + s = wasm_v128_andnot(c, s); // keep only +ve or 0 + s = wasm_v128_or(s, v_m_sh); // combine + wasm_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s, u, c, v_m_sh; + s = wasm_v128_load(sp); + + u = wasm_i64x2_extend_low_i32x4(s); + c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(u, v_m_sh); // combine + + wasm_v128_store(dp, u); + + u = wasm_i64x2_extend_high_i32x4(s); + c = wasm_i64x2_lt(u, zero); 
// 64b -1 for -ve value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(u, v_m_sh); // combine + + wasm_v128_store(dp + 2, u); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + v128_t s, t0, t1, p, n, m, tm; + s = wasm_v128_load(sp); + m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = wasm_i64x2_sub(sh, s); // - shift - value + n = wasm_v128_and(m, tm); // -ve + p = wasm_v128_andnot(m, s); // +ve + t0 = wasm_v128_or(n, p); + + s = wasm_v128_load(sp + 2); + m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = wasm_i64x2_sub(sh, s); // - shift - value + n = wasm_v128_and(m, tm); // -ve + p = wasm_v128_andnot(m, s); // +ve + t1 = wasm_v128_or(n, p); + + t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2); + wasm_v128_store(dp, t0); + } + } + } + ////////////////////////////////////////////////////////////////////////// void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width) @@ -108,62 +260,182 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void wasm_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) { - v128_t sh = wasm_i32x4_splat(shift); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - v128_t s = wasm_v128_load(sp); - s = wasm_i32x4_add(s, sh); - wasm_v128_store(dp, s); - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; - ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) - { - for (int i = (repeat + 3) >> 2; i > 0; --i) + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr = wasm_v128_load(rp); + v128_t mg = wasm_v128_load(gp); + v128_t mb = wasm_v128_load(bp); + v128_t t = wasm_i32x4_add(mr, mb); + t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1)); + wasm_v128_store(yp, wasm_i32x4_shr(t, 2)); + t = wasm_i32x4_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i32x4_sub(mr, mg); + wasm_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else { - v128_t mr = wasm_v128_load(r); - v128_t mg = wasm_v128_load(g); - v128_t mb = wasm_v128_load(b); - v128_t t = wasm_i32x4_add(mr, mb); - t = 
wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1)); - wasm_v128_store(y, wasm_i32x4_shr(t, 2)); - t = wasm_i32x4_sub(mb, mg); - wasm_v128_store(cb, t); - t = wasm_i32x4_sub(mr, mg); - wasm_v128_store(cr, t); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr32 = wasm_v128_load(rp); + v128_t mg32 = wasm_v128_load(gp); + v128_t mb32 = wasm_v128_load(bp); + v128_t mr, mg, mb, t; + mr = wasm_i64x2_extend_low_i32x4(mr32); + mg = wasm_i64x2_extend_low_i32x4(mg32); + mb = wasm_i64x2_extend_low_i32x4(mb32); + + t = wasm_i64x2_add(mr, mb); + t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); + wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); + t = wasm_i64x2_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i64x2_sub(mr, mg); + wasm_v128_store(crp, t); - r += 4; g += 4; b += 4; - y += 4; cb += 4; cr += 4; + yp += 2; cbp += 2; crp += 2; + + mr = wasm_i64x2_extend_high_i32x4(mr32); + mg = wasm_i64x2_extend_high_i32x4(mg32); + mb = wasm_i64x2_extend_high_i32x4(mb32); + + t = wasm_i64x2_add(mr, mb); + t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); + wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); + t = wasm_i64x2_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i64x2_sub(mr, mg); + wasm_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } } } ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void wasm_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 3) >> 2; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - v128_t my = wasm_v128_load(y); - v128_t mcb = wasm_v128_load(cb); - v128_t mcr = wasm_v128_load(cr); + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my = wasm_v128_load(yp); + v128_t mcb = wasm_v128_load(cbp); + v128_t mcr = wasm_v128_load(crp); - v128_t t = wasm_i32x4_add(mcb, mcr); - t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2)); - wasm_v128_store(g, t); - v128_t u = wasm_i32x4_add(mcb, t); - wasm_v128_store(b, u); - u = wasm_i32x4_add(mcr, t); - wasm_v128_store(r, u); + v128_t t = wasm_i32x4_add(mcb, mcr); + t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2)); + wasm_v128_store(gp, t); + v128_t u = wasm_i32x4_add(mcb, t); + wasm_v128_store(bp, u); + u = wasm_i32x4_add(mcr, t); + wasm_v128_store(rp, u); - y += 4; cb += 4; cr += 4; - r += 4; g += 4; b += 4; + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + 
(cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1; + my = wasm_v128_load(yp); + mcb = wasm_v128_load(cbp); + mcr = wasm_v128_load(crp); + + tg0 = wasm_i64x2_add(mcb, mcr); + tg0 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg0, 2)); + tb0 = wasm_i64x2_add(mcb, tg0); + tr0 = wasm_i64x2_add(mcr, tg0); + + yp += 2; cbp += 2; crp += 2; + + my = wasm_v128_load(yp); + mcb = wasm_v128_load(cbp); + mcr = wasm_v128_load(crp); + + tg1 = wasm_i64x2_add(mcb, mcr); + tg1 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg1, 2)); + tb1 = wasm_i64x2_add(mcb, tg1); + tr1 = wasm_i64x2_add(mcr, tg1); + + tr0 = wasm_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2); + tg0 = wasm_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2); + tb0 = wasm_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2); + + wasm_v128_store(rp, tr0); + wasm_v128_store(gp, tg0); + wasm_v128_store(bp, tb0); + + yp += 2; cbp += 2; crp += 2; + rp += 4; gp += 4; bp += 4; + } } } diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp index ee4bb08..c4313ab 100644 --- a/src/core/transform/ojph_transform.cpp +++ b/src/core/transform/ojph_transform.cpp @@ -45,7 +45,9 @@ #include "../codestream/ojph_params_local.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; namespace local { @@ -156,9 +158,9 @@ namespace ojph { #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { - rev_vert_step = avx512_rev_vert_step; - rev_horz_ana = avx512_rev_horz_ana; - rev_horz_syn = avx512_rev_horz_syn; + // rev_vert_step = avx512_rev_vert_step; + // rev_horz_ana = avx512_rev_horz_ana; + // rev_horz_syn = avx512_rev_horz_syn; irv_vert_step = avx512_irv_vert_step; irv_vert_times_K = avx512_irv_vert_times_K; @@ -192,13 +194,14 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) ///////////////////////////////////////////////////////////////////////// - void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, - ui32 repeat, bool synthesis) + static + void gen_rev_vert_step32(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) { const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; si32* dst = aug->i32; const si32* src1 = sig->i32, * src2 = other->i32; @@ -243,9 +246,85 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, - ui32 width, bool even) + static + void gen_rev_vert_step64(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + si64* dst = aug->i64; + const si64* src1 = sig->i64, * src2 = other->i64; + // The general definition of the wavelet in Part 2 is slightly + // different to part 2, although they are mathematically equivalent + // here, we identify the simpler form from Part 1 and employ them + 
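The branch ladder that follows (shared with the 32-bit version) exists to strip the multiply and, where possible, the bias term out of the common 5/3 lifting steps. Both shortcuts can be checked against the general step in isolation (stand-alone sketch; general_step and the sample values are illustrative):

  #include <cassert>
  #include <cstdint>

  // General reversible lifting step: x += (b + a*(s1 + s2)) >> e.
  static int64_t general_step(int64_t x, int64_t s1, int64_t s2,
                              int64_t a, int64_t b, int e)
  {
    return x + ((b + a * (s1 + s2)) >> e);
  }

  int main()
  {
    const int64_t x = 37, s1 = -5, s2 = 12;

    // 5/3 update (a == 1, b == 2, e == 2): only the multiply drops out.
    assert(general_step(x, s1, s2, 1, 2, 2) == x + ((2 + s1 + s2) >> 2));

    // 5/3 predict (a == -1, b == 1, e == 1): with floor shifts,
    // (1 - t) >> 1 == -(t >> 1), so the bias vanishes as well.
    assert(general_step(x, s1, s2, -1, 1, 1) == x - ((s1 + s2) >> 1));

    return 0;
  }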
if (a == 1) + { // 5/3 update and any case with a == 1 + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + *src1++ + *src2++) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + *src1++ + *src2++) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ += (*src1++ + *src2++) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (*src1++ + *src2++) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b - (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b - (*src1++ + *src2++)) >> e; + } + else { // general case + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + gen_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + gen_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { @@ -277,7 +356,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; // extension lp[-1] = lp[0]; @@ -319,11 +398,111 @@ namespace ojph { hdst->i32[0] = src->i32[0] << 1; } } - - ////////////////////////////////////////////////////////////////////////// - void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + + ///////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // combine both lsrc and hsrc into dst + si64* dph = hdst->i64; + si64* dpl = ldst->i64; + si64* sp = src->i64; + ui32 w = width; + if (!even) + { + *dph++ = *sp++; --w; + } + for (; w > 1; w -= 2) + { + *dpl++ = *sp++; *dph++ = *sp++; + } + if (w) + { + *dpl++ = *sp++; --w; + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp + (even ? 1 : 0); + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + (sp[-1] + sp[0])) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp -= (sp[-1] + sp[0]) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b - (sp[-1] + sp[0])) >> e; + } + else { + // general case + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + gen_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + gen_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -337,7 +516,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; // extension oth[-1] = oth[0]; @@ -398,6 +577,105 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth + (ev ? 
0 : 1); + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + (sp[-1] + sp[0])) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp += (sp[-1] + sp[0]) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b - (sp[-1] + sp[0])) >> e; + } + else { + // general case + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + si64* sph = hsrc->i64; + si64* spl = lsrc->i64; + si64* dp = dst->i64; + ui32 w = width; + if (!even) + { + *dp++ = *sph++; --w; + } + for (; w > 1; w -= 2) + { + *dp++ = *spl++; *dp++ = *sph++; + } + if (w) + { + *dp++ = *spl++; --w; + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + gen_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + gen_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + ////////////////////////////////////////////////////////////////////////// void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, const line_buf* other, const line_buf* aug, diff --git a/src/core/transform/ojph_transform.h b/src/core/transform/ojph_transform.h index 0e59632..f7576a1 100644 --- a/src/core/transform/ojph_transform.h +++ b/src/core/transform/ojph_transform.h @@ -42,7 +42,10 @@ #include "ojph_defs.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; + namespace local { union lifting_step; struct param_atk; diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp index 0856662..8838d18 100644 --- a/src/core/transform/ojph_transform_avx.cpp +++ b/src/core/transform/ojph_transform_avx.cpp @@ -61,6 +61,40 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline + void avx_deinterleave32(float* dpl, float* dph, float* sp, int width) + { + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) + { + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void avx_interleave32(float* dp, float* spl, float* sph, int width) + { + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) + { + __m256 a = _mm256_load_ps(spl); + 
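+        // the unpack/permute pair below interleaves the two bands: for
+        // low-pass samples l0..l7 and high-pass samples h0..h7 it emits
+        // l0 h0 l1 h1 ... l7 h7 (unpack works within 128-bit lanes, so a
+        // cross-lane permute is still needed)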
__m256 b = _mm256_load_ps(sph);
+        __m256 c = _mm256_unpacklo_ps(a, b);
+        __m256 d = _mm256_unpackhi_ps(a, b);
+        __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+        __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+        _mm256_store_ps(dp, e);
+        _mm256_store_ps(dp + 8, f);
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void avx_irv_vert_step(const lifting_step* s, const line_buf* sig,
                            const line_buf* other, const line_buf* aug,
@@ -100,11 +134,11 @@
       {
         // split src into ldst and hdst
         {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
           float* sp = src->f32;
           int w = (int)width;
-          AVX_DEINTERLEAVE(dpl, dph, sp, w, even);
+          avx_deinterleave32(dpl, dph, sp, w);
         }
 
         // the actual horizontal transform
@@ -235,10 +269,10 @@
         // combine both lsrc and hsrc into dst
         {
           float* dp = dst->f32;
-          float* spl = lsrc->f32;
-          float* sph = hsrc->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
           int w = (int)width;
-          AVX_INTERLEAVE(dp, spl, sph, w, even);
+          avx_interleave32(dp, spl, sph, w);
         }
       }
       else {
diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp
index 847cd4c..1bc92e6 100644
--- a/src/core/transform/ojph_transform_avx2.cpp
+++ b/src/core/transform/ojph_transform_avx2.cpp
@@ -35,6 +35,7 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include <cassert>
 #include 
 #include "ojph_defs.h"
 
@@ -52,13 +53,95 @@ namespace ojph {
 
   namespace local {
 
     /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline
+    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
+    {
+      // note that m must be obtained using
+      // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
+      __m256i x = _mm256_srli_epi64(a, amt);
+      x = _mm256_xor_si256(x, m);
+      __m256i result = _mm256_sub_epi64(x, m);
+      return result;
+    }
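+
+    // a scalar model of the arithmetic-shift emulation above, using the
+    // library's si64/ui64 types:
+    //   ui64 u = ((ui64)x) >> amt;    // logical shift, zero filled
+    //   si64 m = 1LL << (63 - amt);   // the sign bit, at its new position
+    //   si64 r = (si64)(u ^ m) - m;   // xor/subtract sign-extends the top
+    //                                 // amt bits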
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
+      {
+        __m256 a = _mm256_load_ps(sp);
+        __m256 b = _mm256_load_ps(sp + 8);
+        __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
+        __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
+        __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
+        __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm256_store_ps(dpl, e);
+        _mm256_store_ps(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
+      {
+        __m256 a = _mm256_load_ps(spl);
+        __m256 b = _mm256_load_ps(sph);
+        __m256 c = _mm256_unpacklo_ps(a, b);
+        __m256 d = _mm256_unpackhi_ps(a, b);
+        __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+        __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+        _mm256_store_ps(dp, e);
+        _mm256_store_ps(dp + 8, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m256d a = _mm256_load_pd(sp);
+        __m256d b = _mm256_load_pd(sp + 4);
+        __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
+        __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
+        __m256d e = _mm256_shuffle_pd(c, d, 0x0);
+        __m256d f = _mm256_shuffle_pd(c, d, 0xF);
+        _mm256_store_pd(dpl, e);
+        _mm256_store_pd(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m256d a = _mm256_load_pd(spl);
+        __m256d b = _mm256_load_pd(sph);
+        __m256d c = _mm256_unpacklo_pd(a, b);
+        __m256d d = _mm256_unpackhi_pd(a, b);
+        __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
+        __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
+        _mm256_store_pd(dp, e);
+        _mm256_store_pd(dp + 4, f);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void avx2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const si32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       __m256i va = _mm256_set1_epi32(a);
       __m256i vb = _mm256_set1_epi32(b);
@@ -181,19 +264,154 @@
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    static
+    void avx2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m256i vb = _mm256_set1_epi64x(b);
+      __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1; although the two forms are
+      // mathematically equivalent, here we identify the simpler special
+      // cases from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_add_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_sub_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_add_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_add_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = 
_mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else { // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm256_mullo_epi64. + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + avx2_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + avx2_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - // combine both lsrc and hsrc into dst + // split src into ldst and hdst { - float* dpl = ldst->f32; - float* dph = hdst->f32; - float* sp = src->f32; + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? 
hdst->f32 : ldst->f32; + float* sp = src->f32; int w = (int)width; - AVX_DEINTERLEAVE(dpl, dph, sp, w, even); + avx2_deinterleave32(dpl, dph, sp, w); } si32* hp = hdst->i32, * lp = ldst->i32; @@ -206,7 +424,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -346,11 +564,181 @@ namespace ojph { hdst->i32[0] = src->i32[0] << 1; } } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // split src into ldst and hdst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)src->p; + int w = (int)width; + avx2_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i 
s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm256_mullo_epi64. + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + avx2_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + avx2_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, - ui32 width, bool even) + static + void avx2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -364,7 +752,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -499,11 +887,11 @@ namespace ojph { // combine both lsrc and hsrc into dst { - float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; int w = (int)width; - AVX_INTERLEAVE(dp, spl, sph, w, even); + avx2_interleave32(dp, spl, sph, w); } } else { @@ -514,5 +902,174 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm_mullo_epi64. 
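+            // this branch is only reached for reversible kernels signalled
+            // through Part 2 ATK segments; the 5/3 steps are all caught
+            // above, and the 64-bit mullo intrinsics need AVX512DQ +
+            // AVX512VL, so plain C is used here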
+ if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? hsrc->p : lsrc->p); + int w = (int)width; + avx2_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + avx2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + avx2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_avx512.cpp b/src/core/transform/ojph_transform_avx512.cpp index 504aa87..0e92230 100644 --- a/src/core/transform/ojph_transform_avx512.cpp +++ b/src/core/transform/ojph_transform_avx512.cpp @@ -54,8 +54,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// // We split multiples of 32 followed by multiples of 16, because // we assume byte_alignment == 64 - static void avx512_deinterleave(float* dpl, float* dph, float* sp, - int width, bool even) + static + void avx512_deinterleave32(float* dpl, float* dph, float* sp, int width) { __m512i idx1 = _mm512_set_epi32( 0x1E, 0x1C, 0x1A, 0x18, 0x16, 0x14, 0x12, 0x10, @@ -65,59 +65,33 @@ namespace ojph { 0x1F, 0x1D, 0x1B, 0x19, 0x17, 0x15, 0x13, 0x11, 0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01 ); - if (even) + for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) { - for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) - { - __m512 a = _mm512_load_ps(sp); - __m512 b = _mm512_load_ps(sp + 16); - __m512 c = _mm512_permutex2var_ps(a, idx1, b); - __m512 d = _mm512_permutex2var_ps(a, idx2, b); - _mm512_store_ps(dpl, c); - _mm512_store_ps(dph, d); - } - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, e); - _mm256_store_ps(dph, f); - } + __m512 a = _mm512_load_ps(sp); + __m512 b = _mm512_load_ps(sp + 16); + __m512 c = _mm512_permutex2var_ps(a, idx1, b); + __m512 d = _mm512_permutex2var_ps(a, idx2, b); + _mm512_store_ps(dpl, c); + _mm512_store_ps(dph, d); } - else + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) { - for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) - { - __m512 a = _mm512_load_ps(sp); - __m512 b = _mm512_load_ps(sp + 16); - __m512 c = _mm512_permutex2var_ps(a, 
idx2, b); - __m512 d = _mm512_permutex2var_ps(a, idx1, b); - _mm512_store_ps(dpl, c); - _mm512_store_ps(dph, d); - } - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, f); - _mm256_store_ps(dph, e); - } + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); } } ////////////////////////////////////////////////////////////////////////// // We split multiples of 32 followed by multiples of 16, because // we assume byte_alignment == 64 - static void avx512_interleave(float* dp, float* spl, float* sph, - int width, bool even) + static + void avx512_interleave32(float* dp, float* spl, float* sph, int width) { __m512i idx1 = _mm512_set_epi32( 0x17, 0x7, 0x16, 0x6, 0x15, 0x5, 0x14, 0x4, @@ -127,51 +101,93 @@ namespace ojph { 0x1F, 0xF, 0x1E, 0xE, 0x1D, 0xD, 0x1C, 0xC, 0x1B, 0xB, 0x1A, 0xA, 0x19, 0x9, 0x18, 0x8 ); - if (even) + for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) { - for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) - { - __m512 a = _mm512_load_ps(spl); - __m512 b = _mm512_load_ps(sph); - __m512 c = _mm512_permutex2var_ps(a, idx1, b); - __m512 d = _mm512_permutex2var_ps(a, idx2, b); - _mm512_store_ps(dp, c); - _mm512_store_ps(dp + 16, d); - } - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(a, b); - __m256 d = _mm256_unpackhi_ps(a, b); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m512 a = _mm512_load_ps(spl); + __m512 b = _mm512_load_ps(sph); + __m512 c = _mm512_permutex2var_ps(a, idx1, b); + __m512 d = _mm512_permutex2var_ps(a, idx2, b); + _mm512_store_ps(dp, c); + _mm512_store_ps(dp + 16, d); } - else + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) { - for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) - { - __m512 a = _mm512_load_ps(spl); - __m512 b = _mm512_load_ps(sph); - __m512 c = _mm512_permutex2var_ps(b, idx1, a); - __m512 d = _mm512_permutex2var_ps(b, idx2, a); - _mm512_store_ps(dp, c); - _mm512_store_ps(dp + 16, d); - } - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(b, a); - __m256 d = _mm256_unpackhi_ps(b, a); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m256 a = _mm256_load_ps(spl); + __m256 b = _mm256_load_ps(sph); + __m256 c = _mm256_unpacklo_ps(a, b); + __m256 d = _mm256_unpackhi_ps(a, b); + __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); + __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); + _mm256_store_ps(dp, e); + _mm256_store_ps(dp + 8, 
f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // We split multiples of 16 followed by multiples of 8, because
+    // we assume byte_alignment == 64
+    static void avx512_deinterleave64(double* dpl, double* dph, double* sp,
+                                      int width)
+    {
+      __m512i idx1 = _mm512_set_epi64(
+        0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
+      );
+      __m512i idx2 = _mm512_set_epi64(
+        0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
+      );
+      for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)
+      {
+        __m512d a = _mm512_load_pd(sp);
+        __m512d b = _mm512_load_pd(sp + 8);
+        __m512d c = _mm512_permutex2var_pd(a, idx1, b);
+        __m512d d = _mm512_permutex2var_pd(a, idx2, b);
+        _mm512_store_pd(dpl, c);
+        _mm512_store_pd(dph, d);
+      }
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m256d a = _mm256_load_pd(sp);
+        __m256d b = _mm256_load_pd(sp + 4);
+        __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
+        __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
+        __m256d e = _mm256_shuffle_pd(c, d, 0x0);
+        __m256d f = _mm256_shuffle_pd(c, d, 0xF);
+        _mm256_store_pd(dpl, e);
+        _mm256_store_pd(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // We split multiples of 16 followed by multiples of 8, because
+    // we assume byte_alignment == 64
+    static void avx512_interleave64(double* dp, double* spl, double* sph,
+                                    int width)
+    {
+      __m512i idx1 = _mm512_set_epi64(
+        0xB, 0x3, 0xA, 0x2, 0x9, 0x1, 0x8, 0x0
+      );
+      __m512i idx2 = _mm512_set_epi64(
+        0xF, 0x7, 0xE, 0x6, 0xD, 0x5, 0xC, 0x4
+      );
+      for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)
+      {
+        __m512d a = _mm512_load_pd(spl);
+        __m512d b = _mm512_load_pd(sph);
+        __m512d c = _mm512_permutex2var_pd(a, idx1, b);
+        __m512d d = _mm512_permutex2var_pd(a, idx2, b);
+        _mm512_store_pd(dp, c);
+        _mm512_store_pd(dp + 8, d);
+      }
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m256d a = _mm256_load_pd(spl);
+        __m256d b = _mm256_load_pd(sph);
+        __m256d c = _mm256_unpacklo_pd(a, b);
+        __m256d d = _mm256_unpackhi_pd(a, b);
+        __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
+        __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
+        _mm256_store_pd(dp, e);
+        _mm256_store_pd(dp + 4, f);
+      }
+    }
 
@@ -224,7 +240,13 @@
       if (width > 1)
       {
         // split src into ldst and hdst
-        avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even);
+        {
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          avx512_deinterleave32(dpl, dph, sp, w);
+        }
 
         // the actual horizontal transform
         float* hp = hdst->f32, * lp = ldst->f32;
@@ -352,7 +374,13 @@
         }
 
         // combine both lsrc and hsrc into dst
-        avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even);
+        {
+          float* dp = dst->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
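+          // for a line starting at an odd position, the first output
+          // sample belongs to the high-pass band, hence the swapped
+          // low/high pointers above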
+          int w = (int)width;
+          avx512_interleave32(dp, spl, sph, w);
+        }
       }
       else {
         if (even)
@@ -364,13 +392,13 @@
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                              const line_buf* other, const line_buf* aug,
-                              ui32 repeat, bool synthesis)
+    void avx512_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                                const line_buf* other, const line_buf* aug,
+                                ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       __m512i va = _mm512_set1_epi32(a);
       __m512i vb = _mm512_set1_epi32(b);
@@ -493,14 +521,185 @@
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                             const line_buf* hdst, const line_buf* src,
-                             ui32 width, bool even)
+    void avx512_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                                const line_buf* other, const line_buf* aug,
+                                ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m512i vb = _mm512_set1_epi64(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1; although the two forms are
+      // mathematically equivalent, here we identify the simpler special
+      // cases from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_add_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_add_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i w = _mm512_srai_epi64(t, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i w = _mm512_srai_epi64(t, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d 
= _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + else + for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)src1); + __m512i s2 = _mm512_load_si512((__m512i*)src2); + __m512i d = _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + } + else { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + + // This can only be used if you have AVX512DQ + // { // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)repeat; + // if (synthesis) + // for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)src1); + // __m512i s2 = _mm512_load_si512((__m512i*)src2); + // __m512i d = _mm512_load_si512((__m512i*)dst); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_sub_epi64(d, w); + // _mm512_store_si512((__m512i*)dst, d); + // } + // else + // for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)src1); + // __m512i s2 = _mm512_load_si512((__m512i*)src2); + // __m512i d = _mm512_load_si512((__m512i*)dst); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dst, d); + // } + // } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + avx512_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + avx512_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - // combine both lsrc and hsrc into dst - avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + // split src into ldst and hdst + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? 
hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + avx512_deinterleave32(dpl, dph, sp, w); + } si32* hp = hdst->i32, * lp = ldst->i32; ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass @@ -512,7 +711,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -653,10 +852,211 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // split src into ldst and hdst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)(src->p); + int w = (int)width; + avx512_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + 
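+            // the +1/-1 neighbor offset below encodes the sample phase: an
+            // even start pairs sp[0] with sp[1], an odd start pairs sp[-1]
+            // with sp[0], matching the scalar gen_rev_horz_ana64 code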
int i = (int)h_width; + if (even) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // This can only be used if you have AVX512DQ + // { + // // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)h_width; + // if (even) + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // else + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + avx512_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + avx512_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -670,7 +1070,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const 
si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -804,7 +1204,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + avx512_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -814,5 +1220,206 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + 
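+                  // d now holds dp[i] - ((b - (sp[-1] + sp[0])) >> e),
+                  // the scalar a == -1 synthesis update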
_mm512_store_si512((__m512i*)dp, d);
+              }
+          else
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m512i s1 = _mm512_load_si512((__m512i*)sp);
+              __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+              __m512i d = _mm512_load_si512((__m512i*)dp);
+              __m512i t = _mm512_add_epi64(s1, s2);
+              __m512i v = _mm512_sub_epi64(vb, t);
+              __m512i w = _mm512_srai_epi64(v, e);
+              d = _mm512_sub_epi64(d, w);
+              _mm512_store_si512((__m512i*)dp, d);
+            }
+        }
+        else
+        {
+          // general case
+          // 64bit multiplication is not supported in AVX512F + AVX512CD;
+          // in particular, _mm512_mullo_epi64 requires AVX512DQ.
+          if (ev)
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+          else
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + a * (sp[0] + sp[1])) >> e;
+        }
+
+        // This can only be used if you have AVX512DQ
+        // {
+        //   // general case
+        //   __m512i va = _mm512_set1_epi64(a);
+        //   int i = (int)aug_width;
+        //   if (ev)
+        //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+        //     {
+        //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+        //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+        //       __m512i d = _mm512_load_si512((__m512i*)dp);
+        //       __m512i t = _mm512_add_epi64(s1, s2);
+        //       __m512i u = _mm512_mullo_epi64(va, t);
+        //       __m512i v = _mm512_add_epi64(vb, u);
+        //       __m512i w = _mm512_srai_epi64(v, e);
+        //       d = _mm512_sub_epi64(d, w);
+        //       _mm512_store_si512((__m512i*)dp, d);
+        //     }
+        //   else
+        //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+        //     {
+        //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+        //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+        //       __m512i d = _mm512_load_si512((__m512i*)dp);
+        //       __m512i t = _mm512_add_epi64(s1, s2);
+        //       __m512i u = _mm512_mullo_epi64(va, t);
+        //       __m512i v = _mm512_add_epi64(vb, u);
+        //       __m512i w = _mm512_srai_epi64(v, e);
+        //       d = _mm512_sub_epi64(d, w);
+        //       _mm512_store_si512((__m512i*)dp, d);
+        //     }
+        // }
+
+        // swap buffers
+        si64* t = aug; aug = oth; oth = t;
+        ev = !ev;
+        ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+      }
+
+      // combine both lsrc and hsrc into dst
+      {
+        double* dp = (double*)(dst->p);
+        double* spl = (double*)(even ? lsrc->p : hsrc->p);
+        double* sph = (double*)(even ? 
hsrc->p : lsrc->p); + int w = (int)width; + avx512_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + avx512_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + avx512_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h index ec2a2e1..acf9ee6 100644 --- a/src/core/transform/ojph_transform_local.h +++ b/src/core/transform/ojph_transform_local.h @@ -42,7 +42,10 @@ #include "ojph_defs.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; + namespace local { struct param_atk; union lifting_step; @@ -104,60 +107,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - // Supporting macros - ////////////////////////////////////////////////////////////////////////// - - ////////////////////////////////////////////////////////////////////////// - #define SSE_DEINTERLEAVE(dpl, dph, sp, width, even) \ - { \ - if (even) \ - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ - { \ - __m128 a = _mm_load_ps(sp); \ - __m128 b = _mm_load_ps(sp + 4); \ - __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm_store_ps(dpl, c); \ - _mm_store_ps(dph, d); \ - } \ - else \ - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ - { \ - __m128 a = _mm_load_ps(sp); \ - __m128 b = _mm_load_ps(sp + 4); \ - __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm_store_ps(dpl, d); \ - _mm_store_ps(dph, c); \ - } \ - } - - ////////////////////////////////////////////////////////////////////////// - #define SSE_INTERLEAVE(dp, spl, sph, width, even) \ - { \ - if (even) \ - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ - { \ - __m128 a = _mm_load_ps(spl); \ - __m128 b = _mm_load_ps(sph); \ - __m128 c = _mm_unpacklo_ps(a, b); \ - __m128 d = _mm_unpackhi_ps(a, b); \ - _mm_store_ps(dp, c); \ - _mm_store_ps(dp + 4, d); \ - } \ - else \ - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ - { \ - __m128 a = _mm_load_ps(spl); \ - __m128 b = _mm_load_ps(sph); \ - __m128 c = _mm_unpacklo_ps(b, a); \ - __m128 d = _mm_unpackhi_ps(b, a); \ - _mm_store_ps(dp, c); \ - _mm_store_ps(dp + 4, d); \ - } \ - } - ////////////////////////////////////////////////////////////////////////// // Irreversible functions ////////////////////////////////////////////////////////////////////////// @@ -216,76 +165,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - // Supporting macros - 
//////////////////////////////////////////////////////////////////////////
-
-    //////////////////////////////////////////////////////////////////////////
-    #define AVX_DEINTERLEAVE(dpl, dph, sp, width, even) \
-    { \
-      if (even) \
-      { \
-        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(sp); \
-          __m256 b = _mm256_load_ps(sp + 8); \
-          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); \
-          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); \
-          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
-          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
-          _mm256_store_ps(dpl, e); \
-          _mm256_store_ps(dph, f); \
-        } \
-      } \
-      else \
-      { \
-        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(sp); \
-          __m256 b = _mm256_load_ps(sp + 8); \
-          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); \
-          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); \
-          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
-          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
-          _mm256_store_ps(dpl, f); \
-          _mm256_store_ps(dph, e); \
-        } \
-      } \
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    #define AVX_INTERLEAVE(dp, spl, sph, width, even) \
-    { \
-      if (even) \
-      { \
-        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(spl); \
-          __m256 b = _mm256_load_ps(sph); \
-          __m256 c = _mm256_unpacklo_ps(a, b); \
-          __m256 d = _mm256_unpackhi_ps(a, b); \
-          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); \
-          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); \
-          _mm256_store_ps(dp, e); \
-          _mm256_store_ps(dp + 8, f); \
-        } \
-      } \
-      else \
-      { \
-        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(spl); \
-          __m256 b = _mm256_load_ps(sph); \
-          __m256 c = _mm256_unpacklo_ps(b, a); \
-          __m256 d = _mm256_unpackhi_ps(b, a); \
-          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); \
-          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); \
-          _mm256_store_ps(dp, e); \
-          _mm256_store_ps(dp + 8, f); \
-        } \
-      } \
-    }
-
     //////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index 897a193..dcb5e53 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -50,6 +50,36 @@ namespace ojph {
 
   namespace local {
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m128 a = _mm_load_ps(sp);
+        __m128 b = _mm_load_ps(sp + 4);
+        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm_store_ps(dpl, c);
+        _mm_store_ps(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m128 a = _mm_load_ps(spl);
+        __m128 b = _mm_load_ps(sph);
+        __m128 c = _mm_unpacklo_ps(a, b);
+        __m128 d = _mm_unpackhi_ps(a, b);
+        _mm_store_ps(dp, c);
+        _mm_store_ps(dp + 4, 
d);
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     static inline void sse_multiply_const(float* p, float f, int width)
     {
@@ -100,11 +130,11 @@ namespace ojph {
     {
       // split src into ldst and hdst
       {
-        float* dpl = ldst->f32;
-        float* dph = hdst->f32;
+        float* dpl = even ? ldst->f32 : hdst->f32;
+        float* dph = even ? hdst->f32 : ldst->f32;
         float* sp = src->f32;
         int w = (int)width;
-        SSE_DEINTERLEAVE(dpl, dph, sp, w, even);
+        sse_deinterleave32(dpl, dph, sp, w);
       }
 
       // the actual horizontal transform
       float* hp = hdst->f32, * lp = ldst->f32;
@@ -235,10 +265,10 @@ namespace ojph {
       // combine both lsrc and hsrc into dst
       {
         float* dp = dst->f32;
-        float* spl = lsrc->f32;
-        float* sph = hsrc->f32;
+        float* spl = even ? lsrc->f32 : hsrc->f32;
+        float* sph = even ? hsrc->f32 : lsrc->f32;
         int w = (int)width;
-        SSE_INTERLEAVE(dp, spl, sph, w, even);
+        sse_interleave32(dp, spl, sph, w);
       }
     }
     else {
diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp
index 8328842..a69b1fb 100644
--- a/src/core/transform/ojph_transform_sse2.cpp
+++ b/src/core/transform/ojph_transform_sse2.cpp
@@ -35,6 +35,7 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include 
 #include 
 
 #include "ojph_defs.h"
@@ -52,13 +53,86 @@ namespace ojph {
 
   namespace local {
 
     /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
+    {
+      // note that m must be obtained using
+      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
+      __m128i x = _mm_srli_epi64(a, amt);
+      x = _mm_xor_si128(x, m);
+      __m128i result = _mm_sub_epi64(x, m);
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m128 a = _mm_load_ps(sp);
+        __m128 b = _mm_load_ps(sp + 4);
+        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm_store_ps(dpl, c);
+        _mm_store_ps(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m128 a = _mm_load_ps(spl);
+        __m128 b = _mm_load_ps(sph);
+        __m128 c = _mm_unpacklo_ps(a, b);
+        __m128 d = _mm_unpackhi_ps(a, b);
+        _mm_store_ps(dp, c);
+        _mm_store_ps(dp + 4, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    {
+      for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
+      {
+        __m128d a = _mm_load_pd(sp);
+        __m128d b = _mm_load_pd(sp + 2);
+        __m128d c = _mm_shuffle_pd(a, b, 0);
+        __m128d d = _mm_shuffle_pd(a, b, 3);
+        _mm_store_pd(dpl, c);
+        _mm_store_pd(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
+      {
+        __m128d a = _mm_load_pd(spl);
+        __m128d b = 
_mm_load_pd(sph);
+        __m128d c = _mm_unpacklo_pd(a, b);
+        __m128d d = _mm_unpackhi_pd(a, b);
+        _mm_store_pd(dp, c);
+        _mm_store_pd(dp + 2, d);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const si32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
       __m128i vb = _mm_set1_epi32(b);
 
       si32* dst = aug->i32;
@@ -162,19 +236,153 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    static
+    void sse2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si64 a = s->rev.Aatk;
+      const si64 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m128i vb = _mm_set1_epi64x(b);
+      __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler form from Part 1 and employ it
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_add_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_add_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i w = sse2_mm_srai_epi64(t, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i w = sse2_mm_srai_epi64(t, e, ve);
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_sub_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = 
_mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_sub_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else { // general case
+        // 64bit multiplication is not supported in sse2
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug,
+                            ui32 repeat, bool synthesis)
+    {
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
     {
       if (width > 1)
       {
-        // combine both lsrc and hsrc into dst
+        // split src into ldst and hdst
         {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
           float* sp = src->f32;
           int w = (int)width;
-          SSE_DEINTERLEAVE(dpl, dph, sp, w, even);
+          sse2_deinterleave32(dpl, dph, sp, w);
         }
 
         si32* hp = hdst->i32, * lp = ldst->i32;
@@ -187,7 +395,7 @@ namespace ojph {
          const lifting_step* s = atk->get_step(j - 1);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
-         const si32 e = s->rev.Eatk;
+         const ui8 e = s->rev.Eatk;
          __m128i vb = _mm_set1_epi32(b);
 
          // extension
@@ -284,9 +492,7 @@ namespace ojph {
          }
          else
          { // general case
-           // 32bit multiplication is not supported in sse2; we need sse4.1,
-           // where we can use _mm_mullo_epi32, which multiplies
-           // 32bit x 32bit, keeping the LSBs
+           // 32bit multiplication is not supported in sse2; we need sse4.1
            if (even)
              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
                *dp += (b + a * (sp[0] + sp[1])) >> e;
@@ -308,11 +514,179 @@ namespace ojph {
          hdst->i32[0] = src->i32[0] << 1;
       }
     }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp = (double*)src->p;
+          int w = (int)width;
+          sse2_deinterleave64(dpl, dph, sp, w);
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 
1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in sse2 + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + 
} + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } ////////////////////////////////////////////////////////////////////////// - void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, - ui32 width, bool even) + void sse2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -326,7 +700,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m128i vb = _mm_set1_epi32(b); // extension @@ -443,10 +817,10 @@ namespace ojph { // combine both lsrc and hsrc into dst { float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; int w = (int)width; - SSE_INTERLEAVE(dp, spl, sph, w, even); + sse2_interleave32(dp, spl, sph, w); } } else { @@ -457,5 +831,172 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in sse2 + if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? 
hsrc->p : lsrc->p); + int w = (int)width; + sse2_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp index bd652df..341cfc3 100644 --- a/src/core/transform/ojph_transform_wasm.cpp +++ b/src/core/transform/ojph_transform_wasm.cpp @@ -51,65 +51,69 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void wasm_deinterleave(float* dpl, float* dph, float* sp, - int width, bool even) + static inline + void wasm_deinterleave32(float* dpl, float* dph, float* sp, int width) { - if (even) - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) - { - v128_t a = wasm_v128_load(sp); - v128_t b = wasm_v128_load(sp + 4); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); - v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); - // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); - // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); - wasm_v128_store(dpl, c); - wasm_v128_store(dph, d); - } - else - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) - { - v128_t a = wasm_v128_load(sp); - v128_t b = wasm_v128_load(sp + 4); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); - v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); - // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); - // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); - wasm_v128_store(dpl, d); - wasm_v128_store(dph, c); - } + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 4); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); + v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); + // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); + // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } } ////////////////////////////////////////////////////////////////////////// - void wasm_interleave(float* dp, float* spl, float* sph, - int width, bool even) + static inline + void wasm_interleave32(float* dp, float* spl, float* sph, int width) { - if (even) - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) - { - v128_t a = wasm_v128_load(spl); - v128_t b = wasm_v128_load(sph); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); - v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); - // v128_t c = _mm_unpacklo_ps(a, b); - // v128_t d = _mm_unpackhi_ps(a, b); - wasm_v128_store(dp, c); - wasm_v128_store(dp + 4, d); - } - else - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) - { - v128_t a = wasm_v128_load(spl); - v128_t b = 
wasm_v128_load(sph); - v128_t c = wasm_i32x4_shuffle(b, a, 0, 4 + 0, 1, 4 + 1); - v128_t d = wasm_i32x4_shuffle(b, a, 2, 4 + 2, 3, 4 + 3); - // v128_t c = _mm_unpacklo_ps(b, a); - // v128_t d = _mm_unpackhi_ps(b, a); - wasm_v128_store(dp, c); - wasm_v128_store(dp + 4, d); - } + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); + v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); + // v128_t c = _mm_unpacklo_ps(a, b); + // v128_t d = _mm_unpackhi_ps(a, b); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 4, d); + } } + ////////////////////////////////////////////////////////////////////////// + static inline + void wasm_deinterleave64(double* dpl, double* dph, double* sp, int width) + { + for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 2); + v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void wasm_interleave64(double* dp, double* spl, double* sph, int width) + { + for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 2, d); + } + } + ////////////////////////////////////////////////////////////////////////// static inline void wasm_multiply_const(float* p, float f, int width) { @@ -159,7 +163,13 @@ namespace ojph { if (width > 1) { // split src into ldst and hdst - wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + wasm_deinterleave32(dpl, dph, sp, w); + } // the actual horizontal transform float* hp = hdst->f32, * lp = ldst->f32; @@ -287,7 +297,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? 
hsrc->f32 : lsrc->f32;
+        int w = (int)width;
+        wasm_interleave32(dp, spl, sph, w);
+      }
     }
     else {
       if (even)
@@ -298,13 +314,13 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    void wasm_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
       v128_t va = wasm_i32x4_splat(a);
       v128_t vb = wasm_i32x4_splat(b);
@@ -428,14 +444,174 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    void wasm_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      v128_t va = wasm_i64x2_splat(a);
+      v128_t vb = wasm_i64x2_splat(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler form from Part 1 and employ it
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_add(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_add(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t w = wasm_i64x2_shr(t, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t w = wasm_i64x2_shr(t, e);
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_sub(vb, t);
+            
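+            // note: wasm_i64x2_shr is the arithmetic (sign-preserving)
+            // 64-bit shift, so no emulation is needed here, unlike sse2,
+            // which requires the sse2_mm_srai_epi64 helper above
+            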
v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + else + { // general case + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + wasm_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + wasm_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void wasm_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { // combine both lsrc and hsrc into dst - wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + wasm_deinterleave32(dpl, dph, sp, w); + } si32* hp = hdst->i32, * lp = ldst->i32; ui32 l_width = (width + (even ? 
1 : 0)) >> 1; // low pass
@@ -447,7 +623,7 @@ namespace ojph {
          const lifting_step* s = atk->get_step(j - 1);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
-         const ui32 e = s->rev.Eatk;
+         const ui8 e = s->rev.Eatk;
          v128_t va = wasm_i32x4_splat(a);
          v128_t vb = wasm_i32x4_splat(b);
@@ -587,11 +763,199 @@ namespace ojph {
          hdst->i32[0] = src->i32[0] << 1;
       }
     }
-
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                           const line_buf* lsrc, const line_buf* hsrc,
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void wasm_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp = (double*)src->p;
+          int w = (int)width;
+          wasm_deinterleave64(dpl, dph, sp, w);
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = wasm_i64x2_splat(a);
+          v128_t vb = wasm_i64x2_splat(b);
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si64* sp = lp;
+          si64* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          { // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = 
wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + wasm_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + wasm_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -605,7 +969,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; v128_t va = wasm_i32x4_splat(a); v128_t vb = wasm_i32x4_splat(b); @@ -739,7 +1103,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + wasm_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -749,5 +1119,192 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 
1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = wasm_i64x2_splat(a); + v128_t vb = wasm_i64x2_splat(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = 
wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t u = wasm_i64x2_mul(va, t);
+                v128_t v = wasm_i64x2_add(vb, u);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si64* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          double* dp = (double*)dst->p;
+          double* spl = (double*)(even ? lsrc->p : hsrc->p);
+          double* sph = (double*)(even ? hsrc->p : lsrc->p);
+          int w = (int)width;
+          wasm_interleave64(dp, spl, sph, w);
+        }
+      }
+      else {
+        if (even)
+          dst->i64[0] = lsrc->i64[0];
+        else
+          dst->i64[0] = hsrc->i64[0] >> 1;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc,
+                           ui32 width, bool even)
+    {
+      if (dst->flags & line_buf::LFT_32BIT)
+      {
+        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
+        wasm_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
+      }
+      else
+      {
+        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
+               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
+        wasm_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
+      }
+    }
+
   } // !local
 } // !ojph
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 000409f..8cc1d72 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -3,7 +3,7 @@ include(FetchContent)
 
 FetchContent_Declare(
   googletest
-  URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz
+  URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz
   EXCLUDE_FROM_ALL
 )
 # For Windows: Prevent overriding the parent project's compiler/linker settings
diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp
index 9f77f75..22f148e 100644
--- a/tests/test_executables.cpp
+++ b/tests/test_executables.cpp
@@ -107,8 +107,27 @@ int execute(const std::string& cmd, std::string& result)
 #define REF_FILE_DIR "./jp2k_test_codestreams/openjph/references/"
 #define MSE_PAE_PATH "./mse_pae"
 #define COMPARE_FILES_PATH "./compare_files"
+
+// A note to self, to help with emscripten testing; this was written after
+// the tests were completed.
+// 1. Compile for the target platform (Linux), selecting from the following
+//    code the version that suits you; in particular, it should be the one
+//    that uses node. Ideally, create two versions of test_executables, one
+//    for WASM with SIMD and one for WASM without SIMD -- use the Linux cp
+//    command to create test_executables_simd and test_executables_no_simd.
+// 2. Compile again, without deleting what was compiled; this time compile
+//    using emscripten, targeting WASM. The compilation is very finicky, so
+//    do 'make clean && make' after every change in the code.
+// 3. cd to tests, and run test_executables_simd or test_executables_no_simd.
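+//
+// As an illustrative sketch only (the exact commands depend on your local
+// emscripten setup and are not prescribed by this repository):
+//   emcmake cmake -DCMAKE_BUILD_TYPE=Release ..
+//   emmake make
+//   node ./ojph_compress.js ...
+// passing -msimd128 through CMAKE_CXX_FLAGS produces the WASM SIMD variant.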
+ #define EXPAND_EXECUTABLE "./ojph_expand" #define COMPRESS_EXECUTABLE "./ojph_compress" +//#define EXPAND_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_expand.js" +//#define COMPRESS_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_compress.js" +//#define EXPAND_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_expand_simd.js" +//#define COMPRESS_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_compress_simd.js" +//#define EXPAND_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_expand" +//#define COMPRESS_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_compress" #endif #define TOL_DOUBLE 0.01 #define TOL_INTEGER 1
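Editor's note: every vectorized branch in the reversible paths above computes
the same scalar recurrence, dst +/- ((b + a * (s1 + s2)) >> e), and the SSE2
path additionally emulates the 64-bit arithmetic right shift that the
instruction set lacks. A minimal, self-contained C++ sketch of both ideas
(function names here are illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// Scalar reference for one reversible lifting step: analysis adds the
// correction term, synthesis subtracts it (see the scalar fallbacks above).
static void rev_lift_step(int64_t* dst, const int64_t* s1, const int64_t* s2,
                          uint32_t repeat, int64_t a, int64_t b, uint8_t e,
                          bool synthesis)
{
  for (uint32_t i = 0; i < repeat; ++i)
    if (synthesis)
      dst[i] -= (b + a * (s1[i] + s2[i])) >> e;
    else
      dst[i] += (b + a * (s1[i] + s2[i])) >> e;
}

// The trick behind sse2_mm_srai_epi64: SSE2 has no 64-bit arithmetic right
// shift, so shift logically and restore the sign bits with
// m = 1ULL << (63 - amt); ((x >> amt) ^ m) - m sign-extends the result.
static int64_t srai64_emulated(int64_t x, int amt)
{
  uint64_t m = 1ULL << (63 - amt);
  uint64_t v = (uint64_t)x >> amt;  // logical shift
  return (int64_t)((v ^ m) - m);    // xor/sub propagates the sign bit
}

int main()
{
  // a 5/3-style update step (a == 1, b == 2, e == 2) on a tiny example
  int64_t low[2]  = { 10, -7 };
  int64_t odd1[2] = {  3, -5 };
  int64_t odd2[2] = {  1, -9 };
  rev_lift_step(low, odd1, odd2, 2, 1, 2, 2, false);
  assert(low[0] == 11 && low[1] == -10);

  // the emulated arithmetic shift matches the built-in one
  const int64_t vals[2] = { -1000, 1000 };
  for (int64_t x : vals)
    assert(srai64_emulated(x, 5) == (x >> 5));
  return 0;
}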