diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index df021b0..94ca3eb 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -1,11 +1,16 @@ # taken from https://github.com/onqtam/doctest/blob/master/.github/workflows/main.yml name: C/C++ CI -on: push +on: + push: + pull_request: + types: [opened, reopened] + jobs: build: strategy: + fail-fast: false matrix: include: [ { system: MacOS, runner: macos-latest }, @@ -25,6 +30,7 @@ jobs: build_windows: strategy: + fail-fast: false matrix: include: [ { system: Windows, runner: windows-latest }, @@ -42,9 +48,11 @@ jobs: test: strategy: + fail-fast: false matrix: include: [ - { system: MacOS, runner: macos-latest }, + { system: MacOS-13, runner: macos-13 }, + { system: MacOS-latest, runner: macos-latest }, { system: Ubuntu-latest, runner: ubuntu-latest }, ] name: ${{ matrix.system }} Test @@ -63,6 +71,7 @@ jobs: test_windows: strategy: + fail-fast: false matrix: include: [ { system: Windows, runner: windows-latest }, diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index c2d527a..7d031b5 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -46,11 +46,11 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -64,7 +64,7 @@ # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -77,6 +77,6 @@ # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{matrix.language}}" diff --git a/README.md b/README.md index 15d6597..90064a7 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Open source implementation of High-throughput JPEG2000 (HTJ2K), also known as JPH, JPEG2000 Part 15, ISO/IEC 15444-15, and ITU-T T.814. Here, we are interested in implementing the HTJ2K only, supporting features that are defined in JPEG2000 Part 1 (for example, for wavelet transform, only reversible 5/3 and irreversible 9/7 are supported). -The interested reader is referred to the [short HTJ2K white paper](http://ds.jpeg.org/whitepapers/jpeg-htj2k-whitepaper.pdf), or the [extended HTJ2K white paper](https://htj2k.com/wp-content/uploads/white-paper.pdf) for more details on HTJ2K. [This](https://kakadusoftware.com/wp-content/uploads/icip2019.pdf) paper explores the attainable performance on CPU, while [this](https://kakadusoftware.com/wp-content/uploads/ICIP2019_GPU.pdf) and [this](https://webapps.unsworks.library.unsw.edu.au/fapi/datastream/unsworks:75139/bin990339e4-8805-4456-ae30-223d85f9b1c1) explores performance on the GPU.
+The interested reader is referred to the [short HTJ2K white paper](http://ds.jpeg.org/whitepapers/jpeg-htj2k-whitepaper.pdf), or the [extended HTJ2K white paper](https://htj2k.com/wp-content/uploads/white-paper.pdf) for more details on HTJ2K. [This](https://kakadusoftware.com/wp-content/uploads/icip2019.pdf) paper explores the attainable performance on CPU, while [this](https://kakadusoftware.com/wp-content/uploads/ICIP2019_GPU.pdf) and [this](http://hdl.handle.net/1959.4/unsworks_75139) explore performance on the GPU. # The standard # @@ -17,4 +17,8 @@ The standard is available free of charge from [ITU website](https://www.itu.int/rec/T-REC-T.814/en) * [Compiling and Running in Docker](./docs/docker.md) * [Usage Example](./docs/usage_examples.md) * [Web-based Demos](./docs/web_demos.md) -* [Doxygen Documentation Style](./docs/doxygen_style.md) \ No newline at end of file +* [Doxygen Documentation Style](./docs/doxygen_style.md) + +# Repositories # +[![Packaging status](https://repology.org/badge/vertical-allrepos/openjph.svg)](https://repology.org/project/openjph/versions) + diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h index 8e41493..c18ee76 100644 --- a/src/apps/common/ojph_img_io.h +++ b/src/apps/common/ojph_img_io.h @@ -54,7 +54,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // defined elsewhere class mem_fixed_allocator; - struct line_buf; + class line_buf; //////////////////////////////////////////////////////////////////////////// // @@ -135,7 +135,7 @@ namespace ojph { ui32 cur_line; si64 start_of_data; - int planar; + bool planar; ui32 bit_depth[3]; bool is_signed[3]; point subsampling[3]; @@ -446,6 +446,68 @@ namespace ojph { size_t buffer_size; }; + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + class pfm_in : public image_in_base + { + public: + pfm_in(mem_fixed_allocator *p = NULL) + { + fh = 0; + fname = NULL; + alloc_p = p; + temp_buf = NULL; + temp_buf_byte_size = 0; + bit_depth[0] = bit_depth[1] = bit_depth[2] = 32; + scale = 0.0f; + little_endian = true; + width = height = num_comps = 0; + + cur_line = 0; + start_of_data = 0; + } + virtual ~pfm_in() + { + close(); + if (alloc_p == NULL && temp_buf) + free(temp_buf); + } + + void open(const char* filename); + void finalize_alloc(); + void configure(ui32* bit_depth) { + assert(num_comps != 0); + for (ui32 c = 0; c < num_comps; ++c) + this->bit_depth[c] = bit_depth[c]; + } + virtual ui32 read(const line_buf* line, ui32 comp_num); + void close() { if(fh) { fclose(fh); fh = NULL; } fname = NULL; } + + size get_size() { assert(fh); return size(width, height); } + ui32 get_width() { assert(fh); return width; } + ui32 get_height() { assert(fh); return height; } + ui32 get_num_components() { assert(fh); return num_comps; } + + private: + FILE *fh; + const char *fname; + mem_fixed_allocator *alloc_p; + float *temp_buf; + size_t temp_buf_byte_size; + ui32 bit_depth[3]; // this truncates data to bit_depth in the LSB + float scale; + bool little_endian; + ui32 width, height, num_comps; + ui32 cur_line; + si64 start_of_data; + }; + + //////////////////////////////////////////////////////////////////////////// + // Accelerators (defined in ojph_img_io_*) typedef void (*conversion_fun)(const line_buf *ln0, const line_buf *ln1, @@ -559,7 +621,7 @@ namespace ojph { ui32 width, height, num_components; ui32 bit_depth, bytes_per_sample; ui8*
buffer; - ui32 buffer_size; + size_t buffer_size; ui32 cur_line, samples_per_line, bytes_per_line; conversion_fun converter; const line_buf *lptr[3]; @@ -621,7 +683,7 @@ namespace ojph { ui32 bit_depth_of_data[4]; ui32 bytes_per_sample; ui8* buffer; - ui32 buffer_size; + size_t buffer_size; ui32 cur_line, samples_per_line; }; #endif /* OJPH_ENABLE_TIFF_SUPPORT */ @@ -698,11 +760,60 @@ namespace ojph { const char* fname; bool is_signed; ui32 bit_depth, bytes_per_sample; - si32 lower_val, upper_val; + si64 lower_val, upper_val; ui32 width; ui8* buffer; ui32 buffer_size; }; + + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + class pfm_out : public image_out_base + { + public: + pfm_out() + { + fh = NULL; + fname = NULL; + buffer = NULL; + buffer_size = 0; + width = height = num_components = 0; + scale = -1.0f; + bit_depth[0] = bit_depth[1] = bit_depth[2] = 32; + cur_line = 0; + start_of_data = 0; + } + virtual ~pfm_out() + { + close(); + if (buffer) + free(buffer); + } + + void open(char* filename); + void configure(ui32 width, ui32 height, ui32 num_components, + float scale, ui32* bit_depth); + virtual ui32 write(const line_buf* line, ui32 comp_num); + virtual void close() { if(fh) { fclose(fh); fh = NULL; } fname = NULL; } + + private: + FILE *fh; + const char *fname; + float* buffer; + size_t buffer_size; + ui32 width, height, num_components; + float scale; + ui32 bit_depth[3]; + ui32 cur_line; + si64 start_of_data; + }; + + } #endif // !OJPH_IMG_IO_H diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp index 0c4aa0e..144c837 100644 --- a/src/apps/ojph_compress/ojph_compress.cpp +++ b/src/apps/ojph_compress/ojph_compress.cpp @@ -526,9 +526,9 @@ int main(int argc, char * argv[]) { std::cout << "\nThe following arguments are necessary:\n" #ifdef OJPH_ENABLE_TIFF_SUPPORT - " -i input file name (either pgm, ppm, tif(f), or raw(yuv))\n" + " -i input file name (either pgm, ppm, pfm, tif(f), or raw(yuv))\n" #else - " -i input file name (either pgm, ppm, or raw(yuv))\n" + " -i input file name (either pgm, ppm, pfm, or raw(yuv))\n" #endif // !OJPH_ENABLE_TIFF_SUPPORT " -o output file name\n\n" @@ -587,7 +587,33 @@ int main(int argc, char * argv[]) { " component; for example: 12,10,10\n" " -downsamp {x,y},{x,y},...,{x,y} a list of x,y points, one for each\n" " component; for example {1,1},{2,2},{2,2}\n\n" - ; + "\n" + + ".pfm files receive special treatment. Currently, lossy compression\n" + "with these files is not supported, only lossless. When these files are\n" + "used, the NLT segment marker is automatically inserted into the\n" + "codestream when needed, as explained shortly. 
The following arguments\n" + "can be useful for this file type.\n" + " -signed a comma-separated list of true or false parameters, one\n" + " for each component; for example: true,false,false.\n" + " If you are sure that all sample values are positive or 0,\n" + " set the corresponding entry to false; otherwise set it to\n" + " true.\n" + " When a component entry is set to true, an NLT marker\n" + " segment is inserted into the codestream.\n" + " The NLT segment specifies a non-linear transform that\n" + " changes only negative values, producing better coding\n" + " efficiency.\n" + " The NLT marker segment may not be supported by other\n" + " implementations.\n" + " -bit_depth a comma-separated list of bit depth values, one per\n" + " component; for example: 12,10,10.\n" + " Floating-point values are treated as integers, and they\n" + " are shifted to the right, keeping only the specified\n" + " number of bits. Up to 32 bits (which is the default) are\n" + " supported.\n" + + "\n"; return -1; } if (!get_arguments(argc, argv, input_filename, output_filename, @@ -611,6 +637,7 @@ int main(int argc, char * argv[]) { ojph::codestream codestream; ojph::ppm_in ppm; + ojph::pfm_in pfm; ojph::yuv_in yuv; ojph::raw_in raw; ojph::dpx_in dpx; @@ -736,6 +763,106 @@ int main(int argc, char * argv[]) { base = &ppm; } + else if (is_matching(".pfm", v)) + { + pfm.open(input_filename); + ojph::param_siz siz = codestream.access_siz(); + siz.set_image_extent(ojph::point(image_offset.x + pfm.get_width(), + image_offset.y + pfm.get_height())); + ojph::ui32 num_comps = pfm.get_num_components(); + assert(num_comps == 1 || num_comps == 3); + siz.set_num_components(num_comps); + + if (bit_depth[0] != 0) // one was set + if (num_bit_depths < num_comps) // but if not enough, repeat + for (ojph::ui32 c = num_bit_depths; c < num_comps; ++c) + bit_depth[c] = bit_depth[num_bit_depths - 1]; + if (is_signed[0] != -1) // one was set + if (num_is_signed < num_comps) // but if not enough, repeat + for (ojph::ui32 c = num_is_signed; c < num_comps; ++c) + is_signed[c] = is_signed[num_is_signed - 1]; + + bool all_the_same = true; + if (num_comps == 3) + { + all_the_same = all_the_same + && bit_depth[0] == bit_depth[1] + && bit_depth[1] == bit_depth[2]; + all_the_same = all_the_same + && is_signed[0] == is_signed[1] + && is_signed[1] == is_signed[2]; + } + + pfm.configure(bit_depth); + ojph::point ds(1, 1); + for (ojph::ui32 c = 0; c < num_comps; ++c) { + ojph::ui32 bd = 32; + if (bit_depth[c] != 0) + bd = bit_depth[c]; + bool is = true; + if (is_signed[c] != -1) + is = is_signed[c] != 0; + siz.set_component(c, ds, bd, is); + } + siz.set_image_offset(image_offset); + siz.set_tile_size(tile_size); + siz.set_tile_offset(tile_offset); + + ojph::param_cod cod = codestream.access_cod(); + cod.set_num_decomposition(num_decompositions); + cod.set_block_dims(block_size.w, block_size.h); + if (num_precincts != -1) + cod.set_precinct_size(num_precincts, precinct_size); + cod.set_progression_order(prog_order); + if (num_comps == 1) + { + if (employ_color_transform != -1) + OJPH_WARN(0x01000092, + "-colour_trans option is not needed and was not used; " + "this is because the image has one component only\n"); + } + else + { + if (employ_color_transform == -1) + cod.set_color_transform(true); + else + cod.set_color_transform(employ_color_transform == 1); + } + cod.set_reversible(reversible); + if (!reversible && quantization_step != -1.0f) + codestream.access_qcd().set_irrev_quant(quantization_step); + + ojph::param_nlt nlt = 
codestream.access_nlt(); + if (reversible) { + if (all_the_same) + nlt.set_type3_transformation(ojph::param_nlt::ALL_COMPS, true); + else + for (ojph::ui32 c = 0; c < num_comps; ++c) + nlt.set_type3_transformation(c, true); + } + else + OJPH_ERROR(0x01000093, "We currently support lossless only for " + "pfm images; this may change in the future."); + + codestream.set_planar(false); + if (profile_string[0] != '\0') + codestream.set_profile(profile_string); + codestream.set_tilepart_divisions(tileparts_at_resolutions, + tileparts_at_components); + codestream.request_tlm_marker(tlm_marker); + + if (dims.w != 0 || dims.h != 0) + OJPH_WARN(0x01000094, + "-dims option is not needed and was not used\n"); + if (num_components != 0) + OJPH_WARN(0x01000095, + "-num_comps is not needed and was not used\n"); + if (comp_downsampling[0].x != 0 || comp_downsampling[0].y != 0) + OJPH_WARN(0x01000096, + "-downsamp is not needed and was not used\n"); + + base = &pfm; + } #ifdef OJPH_ENABLE_TIFF_SUPPORT else if (is_matching(".tif", v) || is_matching(".tiff", v)) { diff --git a/src/apps/ojph_expand/ojph_expand.cpp b/src/apps/ojph_expand/ojph_expand.cpp index 7d6f3d5..3d3b981 100644 --- a/src/apps/ojph_expand/ojph_expand.cpp +++ b/src/apps/ojph_expand/ojph_expand.cpp @@ -213,6 +213,7 @@ int main(int argc, char *argv[]) { ojph::codestream codestream; ojph::ppm_out ppm; + ojph::pfm_out pfm; #ifdef OJPH_ENABLE_TIFF_SUPPORT ojph::tif_out tif; #endif /* OJPH_ENABLE_TIFF_SUPPORT */ @@ -266,6 +267,59 @@ int main(int argc, char *argv[]) { ppm.open(output_filename); base = &ppm; } + else if (is_matching(".pfm", v)) + { + codestream.set_planar(false); + ojph::param_siz siz = codestream.access_siz(); + ojph::param_cod cod = codestream.access_cod(); + ojph::param_nlt nlt = codestream.access_nlt(); + + ojph::ui32 num_comps = siz.get_num_components(); + if (num_comps != 3 && num_comps != 1) + OJPH_ERROR(0x0200000C, + "The file has %d color components; this cannot be saved to" + " a .pfm file\n", num_comps); + bool all_same = true; + ojph::point p = siz.get_downsampling(0); + for (ojph::ui32 i = 1; i < siz.get_num_components(); ++i) + { + ojph::point p1 = siz.get_downsampling(i); + all_same = all_same && (p1.x == p.x) && (p1.y == p.y); + } + if (!all_same) + OJPH_ERROR(0x0200000D, + "To save an image to pfm, all the components must have the " + "same downsampling ratio\n"); + ojph::ui32 bit_depth[3]; + for (ojph::ui32 c = 0; c < siz.get_num_components(); ++c) { + ojph::ui8 bd = 0; + bool is = true; + bool result = nlt.get_type3_transformation(c, bd, is); + if (result == false) + OJPH_ERROR(0x0200000E, + "This codestream is not supported; it does not have an " + "NLT segment marker for this component (or default NLT " + "settings).\n"); + if (bd != siz.get_bit_depth(c) || is != siz.is_signed(c)) + OJPH_ERROR(0x0200000F, + "There is a discrepancy in component %d configuration between " + "SIZ marker segment, which specifies bit_depth = %d and " + "signedness = %s, and NLT marker segment, which specifies " + "bit_depth = %d and signedness = %s.\n", c, + siz.get_bit_depth(c), siz.is_signed(c) ? "True" : "False", + bd, is ? "True" : "False"); + bit_depth[c] = bd; + } + if (!cod.is_reversible()) + OJPH_ERROR(0x02000010, + "This codestream is lossy (not reversible), and we currently " + "only support reversible codestreams for .pfm target files. 
" + "This is only temporary and will be changed at some point.\n"); + pfm.configure(siz.get_recon_width(0), siz.get_recon_height(0), + siz.get_num_components(), -1.0f, bit_depth); + pfm.open(output_filename); + base = &pfm; + } #ifdef OJPH_ENABLE_TIFF_SUPPORT else if (is_matching(".tif", v) || is_matching(".tiff", v)) { diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index 82bbe10..89b8127 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -247,7 +247,7 @@ namespace ojph { assert(fh == 0); fh = fopen(filename, "rb"); if (fh == 0) - OJPH_ERROR(0x030000001, "Unable to open file %s", filename); + OJPH_ERROR(0x03000001, "Unable to open file %s", filename); fname = filename; // read magic number @@ -255,27 +255,27 @@ namespace ojph { if (fread(t, 1, 2, fh) != 2) { close(); - OJPH_ERROR(0x030000002, "Error reading file %s", filename); + OJPH_ERROR(0x03000002, "Error reading file %s", filename); } // check magic number if (t[0] != 'P' || (t[1] != '5' && t[1] != '6')) { close(); - OJPH_ERROR(0x030000003, "unknown file type for file %s", filename); + OJPH_ERROR(0x03000003, "unknown file type for file %s", filename); } size_t len = strlen(filename); if (t[1] == '5' && strncmp(filename + len - 4, ".pgm", 4) != 0) { close(); - OJPH_ERROR(0x030000004, "wrong file extension, a file with " + OJPH_ERROR(0x03000004, "wrong file extension, a file with " "keyword P5 must have a .pgm extension for file %s", filename); } if (t[1] == '6' && strncmp(filename + len - 4, ".ppm", 4) != 0) { close(); - OJPH_ERROR(0x030000005, "wrong file extension, a file with keyword P6 " + OJPH_ERROR(0x03000005, "wrong file extension, a file with keyword P6 " "must have a .ppm extension for file %s", filename); } @@ -287,7 +287,7 @@ namespace ojph { if (fscanf(fh, "%d %d %d", &width, &height, &max_val) != 3) { close(); - OJPH_ERROR(0x030000006, "error in file format for file %s", filename); + OJPH_ERROR(0x03000006, "error in file format for file %s", filename); } num_ele_per_line = num_comps * width; bytes_per_sample = max_val > 255 ? 
2 : 1; @@ -309,7 +309,7 @@ temp_buf = malloc(temp_buf_byte_size); if (temp_buf == NULL) { // failed to allocate memory if (t) free(t); // the original buffer is still valid - OJPH_ERROR(0x030000007, "error allocating memory"); + OJPH_ERROR(0x03000007, "error allocating memory"); } } else @@ -329,9 +329,9 @@ return; if (bytes_per_sample == 1) - temp_buf = alloc_p->post_alloc_data<ui8>(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data<ui8>(num_comps * (size_t)width, 0); else - temp_buf = alloc_p->post_alloc_data<ui16>(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data<ui16>(num_comps * (size_t)width, 0); } ///////////////////////////////////////////////////////////////////////////// @@ -347,7 +347,7 @@ if (result != num_ele_per_line) { close(); - OJPH_ERROR(0x030000011, "not enough data in file %s", fname); + OJPH_ERROR(0x03000011, "not enough data in file %s", fname); } if (++cur_line >= height) { @@ -394,21 +394,21 @@ if (strncmp(".ppm", filename + len - 4, 4) == 0) { filename[len - 2] = 'g'; - OJPH_WARN(0x03000001, "file was renamed %s\n", filename); + OJPH_WARN(0x03000021, "file was renamed %s\n", filename); } if (strncmp(".PPM", filename + len - 4, 4) == 0) { filename[len - 2] = 'G'; - OJPH_WARN(0x03000002, "file was renamed %s\n", filename); + OJPH_WARN(0x03000022, "file was renamed %s\n", filename); } } fh = fopen(filename, "wb"); if (fh == NULL) - OJPH_ERROR(0x030000021, + OJPH_ERROR(0x03000023, "unable to open file %s for writing", filename); fprintf(fh, "P5\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } else @@ -419,23 +419,23 @@ if (strncmp(".pgm", filename + len - 4, 4) == 0) { filename[len - 2] = 'p'; - OJPH_WARN(0x03000003, "file was renamed %s\n", filename); + OJPH_WARN(0x03000024, "file was renamed %s\n", filename); } if (strncmp(".PGM", filename + len - 4, 4) == 0) { filename[len - 2] = 'P'; - OJPH_WARN(0x03000004, "file was renamed %s\n", filename); + OJPH_WARN(0x03000025, "file was renamed %s\n", filename); } } fh = fopen(filename, "wb"); if (fh == NULL) - OJPH_ERROR(0x030000022, + OJPH_ERROR(0x03000026, "unable to open file %s for writing", filename); int result = //the number of written characters fprintf(fh, "P6\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); if (result == 0) - OJPH_ERROR(0x030000023, "error writing to file %s", filename); - buffer_size = width * num_components * bytes_per_sample; + OJPH_ERROR(0x03000027, "error writing to file %s", filename); + buffer_size = (size_t)width * num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } fname = filename; @@ -448,7 +448,7 @@ { assert(fh == NULL); //configure before opening if (num_components != 1 && num_components != 3) - OJPH_ERROR(0x030000031, + OJPH_ERROR(0x03000031, "ppm supports 3 colour components, while pgm supports 1"); this->width = width; this->height = height; @@ -530,12 +530,257 @@ size_t result = fwrite(buffer, bytes_per_sample, samples_per_line, fh); if (result != samples_per_line) - OJPH_ERROR(0x030000042, "error writing to file %s", fname); + OJPH_ERROR(0x03000041, "error writing to file %s", fname); } return 0; } //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + + 
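[Reviewer note: not part of the patch.] For orientation, the PFM header that pfm_in::open() below parses, and that pfm_out::open() later emits, is a short ASCII preamble followed by raw 32-bit IEEE-754 floats stored bottom row first. A minimal sketch of a conforming header writer follows; write_pfm_header is a hypothetical helper, but the fprintf format mirrors the one used by pfm_out::open() in this patch:

    #include <cstdio>

    // "PF" announces 3-component color, "Pf" 1-component grayscale.
    // The sign of the scale encodes sample endianness: negative means
    // little-endian. Rows of raw floats follow, bottom row first.
    void write_pfm_header(std::FILE* fh, int width, int height,
                          int num_comps, float scale)
    {
      std::fprintf(fh, "P%c\n%d %d\n%f\n",
                   num_comps > 1 ? 'F' : 'f', width, height, scale);
    }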
///////////////////////////////////////////////////////////////////////////// + void pfm_in::open(const char *filename) + { + assert(fh == 0); + fh = fopen(filename, "rb"); + if (fh == 0) + OJPH_ERROR(0x03000051, "Unable to open file %s", filename); + fname = filename; + + // read magic number + char t[2]; + if (fread(t, 1, 2, fh) != 2) + { + close(); + OJPH_ERROR(0x03000052, "Error reading file %s", filename); + } + + // check magic number + if (t[0] != 'P' || (t[1] != 'F' && t[1] != 'f')) + { + close(); + OJPH_ERROR(0x03000053, "Unknown file type for file %s", filename); + } + + // set number of components based on file-type + num_comps = t[1] == 'f' ? 1 : 3; + eat_white_spaces(fh); + + // read width and height from the header + if (fscanf(fh, "%d %d", &width, &height) != 2) + { + close(); + OJPH_ERROR(0x03000054, + "Error reading width and height in file %s", filename); + } + eat_white_spaces(fh); + + // read scale; its sign selects little or big-endian + if (fscanf(fh, "%f", &scale) != 1) + { + close(); + OJPH_ERROR(0x03000055, "Error reading scale in file %s", filename); + } + little_endian = scale < 0.0f; + scale = std::abs(scale); + + fgetc(fh); + start_of_data = ojph_ftell(fh); + + // alloc. linebuffer to hold a line of image data, if more than 1 comp. + if (temp_buf_byte_size < num_comps * (size_t)width * sizeof(float)) + { + if (alloc_p == NULL) + { + temp_buf_byte_size = num_comps * (size_t)width * sizeof(float); + void* t = temp_buf; + if (temp_buf) + temp_buf = (float*)realloc(temp_buf, temp_buf_byte_size); + else + temp_buf = (float*)malloc(temp_buf_byte_size); + if (temp_buf == NULL) { // failed to allocate memory + if (t) free(t); // the original buffer is still valid + OJPH_ERROR(0x03000056, "Error allocating memory"); + } + } + else + { + assert(temp_buf_byte_size == 0); //cannot reallocate the buffer + temp_buf_byte_size = num_comps * (size_t)width * sizeof(float); + alloc_p->pre_alloc_data<ui8>(temp_buf_byte_size, 0); + } + } + cur_line = 0; + } + + ///////////////////////////////////////////////////////////////////////////// + void pfm_in::finalize_alloc() + { + if (alloc_p == NULL) + return; + temp_buf = alloc_p->post_alloc_data<float>(num_comps * (size_t)width, 0); + } + + ///////////////////////////////////////////////////////////////////////////// + ui32 pfm_in::read(const line_buf* line, ui32 comp_num) + { + assert(temp_buf_byte_size != 0); + assert(fh != 0 && comp_num < num_comps); + assert(line->size >= width); + + if (comp_num == 0) + { + si64 loc = start_of_data; + loc += (size_t)(height-1 - cur_line) * (size_t)num_comps + * (size_t)width * sizeof(float); + if (ojph_fseek(fh, loc, SEEK_SET) != 0) + { + close(); + OJPH_ERROR(0x03000061, "Error seeking in file %s", fname); + } + size_t result = + fread(temp_buf, sizeof(float), (size_t)num_comps * (size_t)width, fh); + if (result != (size_t)num_comps * (size_t)width) + { + close(); + OJPH_ERROR(0x03000062, "Not enough data in file %s", fname); + } + if (++cur_line >= height) + cur_line = 0; + } + + union { + si32* s; + ui32* u; + float* f; + } sp, dp; + + if (little_endian) + { + ui32 shift = 32 - bit_depth[comp_num]; + sp.f = temp_buf + comp_num; + dp.f = line->f32; + if (shift) + for (ui32 i = width; i > 0; --i, sp.f += num_comps) + { + si32 s = *sp.s; + s >>= shift; + *dp.s++ = s; + } + else + for (ui32 i = width; i > 0; --i, sp.f += num_comps) + *dp.f++ = *sp.f; + } + else { + ui32 shift = 32 - bit_depth[comp_num]; + sp.f = temp_buf + comp_num; + dp.f = line->f32; + if (shift) + for (ui32 i = width; i > 0; --i, sp.f += num_comps) { + ui32 u = 
be2le(*sp.u); + si32 s = *(si32*)&u; + s >>= shift; + *dp.s++ = s; + } + else + for (ui32 i = width; i > 0; --i, sp.f += num_comps) + *dp.u++ = be2le(*sp.u); + } + + return width; + } + + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + + //////////////////////////////////////////////////////////////////////////// + void pfm_out::open(char* filename) + { + assert(fh == NULL && buffer == NULL); + fh = fopen(filename, "wb"); + if (fh == NULL) + OJPH_ERROR(0x03000071, + "Unable to open file %s for writing", filename); + int result = //the number of written characters + fprintf(fh, "P%c\n%d %d\n%f\n", + num_components > 1 ? 'F' : 'f', width, height, scale); + if (result == 0) + OJPH_ERROR(0x03000072, "error writing to file %s", filename); + buffer_size = (size_t)width * num_components * sizeof(float); + buffer = (float*)malloc(buffer_size); + fname = filename; + cur_line = 0; + start_of_data = ojph_ftell(fh); + } + + //////////////////////////////////////////////////////////////////////////// + void pfm_out::configure(ui32 width, ui32 height, ui32 num_components, + float scale, ui32* bit_depth) + { + assert(fh == NULL); //configure before opening + if (num_components != 1 && num_components != 3) + OJPH_ERROR(0x03000081, + "pfm supports 1 or 3 colour components, not %d", num_components); + this->width = width; + this->height = height; + this->num_components = num_components; + this->scale = scale < 0.0f ? scale : -scale; + for (ui32 c = 0; c < num_components; ++c) + this->bit_depth[c] = bit_depth[c]; + } + + //////////////////////////////////////////////////////////////////////////// + ui32 pfm_out::write(const line_buf* line, ui32 comp_num) + { + assert(fh); + + ui32 shift = 32 - bit_depth[comp_num]; + union { + ui32* u; + float* f; + } sp, dp; + + dp.f = buffer + comp_num; + sp.f = line->f32; + + if (shift) + for (ui32 i = width; i > 0; --i, dp.f += num_components, ++sp.f) + { + ui32 u = *sp.u; + u <<= shift; + *dp.u = u; + } + else + for (ui32 i = width; i > 0; --i, dp.f += num_components) + *dp.f = *sp.f++; + + if (comp_num == num_components - 1) + { + size_t samples_per_line = num_components * (size_t)width; + si64 loc = start_of_data; + loc += (height - 1 - cur_line)* samples_per_line * sizeof(float); + if (ojph_fseek(fh, loc, SEEK_SET) != 0) + OJPH_ERROR(0x03000082, "Error seeking in file %s", fname); + size_t result = fwrite(buffer, sizeof(float), samples_per_line, fh); + if (result != samples_per_line) + OJPH_ERROR(0x03000083, "error writing to file %s", fname); + ++cur_line; + } + + return 0; + } + + //////////////////////////////////////////////////////////////////////////// // // // @@ -548,7 +793,7 @@ namespace ojph { { tiff_handle = NULL; if ((tiff_handle = TIFFOpen(filename, "r")) == NULL) - OJPH_ERROR(0x0300000B1, "Unable to open file %s", filename); + OJPH_ERROR(0x03000091, "Unable to open file %s", filename); fname = filename; ui32 tiff_width = 0; @@ -588,7 +833,7 @@ namespace ojph { // allocate linebuffer to hold a line of image data line_buffer = malloc(bytes_per_line); if (NULL == line_buffer) - OJPH_ERROR(0x0300000B2, "Unable to allocate %d bytes for line_buffer[] " + OJPH_ERROR(0x03000092, "Unable to allocate %d bytes for line_buffer[] " "for file %s", bytes_per_line, filename); cur_line = 0; @@ -596,7 +841,7 @@ namespace ojph { // Error on known incompatilbe input formats if( tiff_bits_per_sample != 8 && tiff_bits_per_sample != 16 ) { - 
OJPH_ERROR(0x0300000B3, "\nTIFF IO is currently limited" + OJPH_ERROR(0x03000093, "\nTIFF IO is currently limited" " to files with TIFFTAG_BITSPERSAMPLE=8 and TIFFTAG_BITSPERSAMPLE=16 \n" "input file = %s has TIFFTAG_BITSPERSAMPLE=%d", filename, tiff_bits_per_sample); @@ -604,14 +849,14 @@ namespace ojph { if( TIFFIsTiled( tiff_handle ) ) { - OJPH_ERROR(0x0300000B4, "\nTIFF IO is currently limited to TIF files " + OJPH_ERROR(0x03000094, "\nTIFF IO is currently limited to TIF files " "without tiles. \nInput file %s has been detected as tiled", filename); } if(PHOTOMETRIC_RGB != tiff_photometric && PHOTOMETRIC_MINISBLACK != tiff_photometric ) { - OJPH_ERROR(0x0300000B5, "\nTIFF IO is currently limited to " + OJPH_ERROR(0x03000095, "\nTIFF IO is currently limited to " "TIFFTAG_PHOTOMETRIC=PHOTOMETRIC_MINISBLACK=%d and " "PHOTOMETRIC_RGB=%d. \nInput file %s has been detected " "TIFFTAG_PHOTOMETRIC=%d", @@ -620,7 +865,7 @@ namespace ojph { if( tiff_samples_per_pixel > 4 ) { - OJPH_ERROR(0x0300000B6, "\nTIFF IO is currently limited to " + OJPH_ERROR(0x03000096, "\nTIFF IO is currently limited to " "TIFFTAG_SAMPLESPERPIXEL=4 \nInput file %s has been detected with " "TIFFTAG_SAMPLESPERPIXEL=%d", filename, tiff_samples_per_pixel); @@ -642,7 +887,7 @@ namespace ojph { line_buffer_for_planar_support_uint8 = (uint8_t*)calloc(width, sizeof(uint8_t)); if (NULL == line_buffer_for_planar_support_uint8) - OJPH_ERROR(0x0300000B7, "Unable to allocate %d bytes for " + OJPH_ERROR(0x03000097, "Unable to allocate %d bytes for " "line_buffer_for_planar_support_uint8[] for file %s", width * sizeof(uint8_t), filename); } @@ -652,7 +897,7 @@ namespace ojph { line_buffer_for_planar_support_uint16 = (uint16_t*)calloc(width, sizeof(uint16_t)); if (NULL == line_buffer_for_planar_support_uint16) - OJPH_ERROR(0x0300000B8, "Unable to allocate %d bytes for " + OJPH_ERROR(0x03000098, "Unable to allocate %d bytes for " "line_buffer_for_planar_support_uint16[] for file %s", width * sizeof(uint16_t), filename); } @@ -664,7 +909,7 @@ namespace ojph { void tif_in::set_bit_depth(ui32 num_bit_depths, ui32* bit_depth) { if (num_bit_depths < 1) - OJPH_ERROR(0x030000B9, "one or more bit_depths must be provided"); + OJPH_ERROR(0x030000A1, "one or more bit_depths must be provided"); ui32 last_bd_idx = 0; for (ui32 i = 0; i < 4; ++i) { @@ -673,7 +918,7 @@ namespace ojph { if (bd > 32 || bd < 1) { - OJPH_ERROR(0x0300000BA, + OJPH_ERROR(0x030000A2, "bit_depth = %d, this must be an integer from 1-32", bd); } this->bit_depth[i] = bd; @@ -690,12 +935,12 @@ namespace ojph { // the first time trying to access this line if (PLANARCONFIG_SEPARATE == planar_configuration && 0 == comp_num ) { - for (unsigned short color = 0; color < num_comps; color++) + for (ui32 color = 0; color < num_comps; color++) { if (bytes_per_sample == 1) { TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint8, - cur_line, color); + cur_line, (ui16)color); ui32 x = color; uint8_t* line_buffer_of_interleaved_components = (uint8_t*)line_buffer; @@ -708,7 +953,7 @@ namespace ojph { else if (bytes_per_sample == 2) { TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint16, - cur_line, color); + cur_line, (ui16)color); ui32 x = color; ui16* line_buffer_of_interleaved_components = (ui16*)line_buffer; for (ui32 i = 0; i < width; i++, x += num_comps) @@ -809,23 +1054,23 @@ namespace ojph { } if (max_bitdepth > 16) { - OJPH_WARN(0x0300000C2, "TIFF output is currently limited to files " + OJPH_WARN(0x030000B1, "TIFF output is currently limited to files " 
"with max_bitdepth = 16, the source codestream has max_bitdepth=%d" ", the decoded data will be truncated to 16 bits", max_bitdepth); } if (num_components > 4) { - OJPH_ERROR(0x0300000C3, "TIFF IO is currently limited to files with " + OJPH_ERROR(0x030000B2, "TIFF IO is currently limited to files with " "num_components=1 to 4"); } assert(tiff_handle == NULL && buffer == NULL); if ((tiff_handle = TIFFOpen(filename, "w")) == NULL) { - OJPH_ERROR(0x0300000C1, "unable to open file %s for writing", filename); + OJPH_ERROR(0x030000B3, "unable to open file %s for writing", filename); } - buffer_size = width * num_components * bytes_per_sample; + buffer_size = width * (size_t)num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; cur_line = 0; @@ -901,7 +1146,7 @@ namespace ojph { bytes_per_sample = 2; } samples_per_line = num_components * width; - bytes_per_line = bytes_per_sample * samples_per_line; + bytes_per_line = bytes_per_sample * (size_t)samples_per_line; } @@ -1014,7 +1259,7 @@ namespace ojph { { int result = TIFFWriteScanline(tiff_handle, buffer, cur_line++); if (result != 1) - OJPH_ERROR(0x0300000C4, "error writing to file %s", fname); + OJPH_ERROR(0x030000C1, "error writing to file %s", fname); } return 0; } @@ -1034,7 +1279,7 @@ namespace ojph { assert(fh == NULL); fh = fopen(filename, "rb"); if (fh == 0) - OJPH_ERROR(0x03000051, "Unable to open file %s", filename); + OJPH_ERROR(0x030000D1, "Unable to open file %s", filename); //need to extract info from filename @@ -1062,7 +1307,7 @@ namespace ojph { if (result != width[comp_num]) { close(); - OJPH_ERROR(0x03000061, "not enough data in file %s", fname); + OJPH_ERROR(0x030000E1, "not enough data in file %s", fname); } if (bytes_per_sample[comp_num] == 1) @@ -1088,11 +1333,11 @@ namespace ojph { ui32 num_downsamplings, const point *subsampling) { if (num_components != 1 && num_components !=3) - OJPH_ERROR(0x03000071, "yuv_in support 1 or 3 components"); + OJPH_ERROR(0x030000F1, "yuv_in support 1 or 3 components"); this->num_com = num_components; if (num_downsamplings < 1) - OJPH_ERROR(0x03000072, "one or more downsampling must be provided"); + OJPH_ERROR(0x030000F2, "one or more downsampling must be provided"); ui32 last_downsamp_idx = 0; for (ui32 i = 0; i < num_components; ++i) @@ -1114,7 +1359,7 @@ namespace ojph { void yuv_in::set_bit_depth(ui32 num_bit_depths, ui32* bit_depth) { if (num_bit_depths < 1) - OJPH_ERROR(0x03000081, "one or more bit_depths must be provided"); + OJPH_ERROR(0x03000101, "one or more bit_depths must be provided"); ui32 last_bd_idx = 0; for (ui32 i = 0; i < 3; ++i) { @@ -1156,7 +1401,7 @@ namespace ojph { assert(fh == NULL); //configure before open fh = fopen(filename, "wb"); if (fh == 0) - OJPH_ERROR(0x03000091, "Unable to open file %s", filename); + OJPH_ERROR(0x03000111, "Unable to open file %s", filename); fname = filename; } @@ -1199,7 +1444,7 @@ namespace ojph { *dp++ = (ui16)val; } if (fwrite(buffer, 2, w, fh) != w) - OJPH_ERROR(0x030000A1, "unable to write to file %s", fname); + OJPH_ERROR(0x03000121, "unable to write to file %s", fname); } else { @@ -1213,7 +1458,7 @@ namespace ojph { *dp++ = (ui8)val; } if (fwrite(buffer, 1, w, fh) != w) - OJPH_ERROR(0x030000A2, "unable to write to file %s", fname); + OJPH_ERROR(0x03000122, "unable to write to file %s", fname); } return w; @@ -1233,11 +1478,11 @@ namespace ojph { assert(fh == NULL); fh = fopen(filename, "rb"); if (fh == NULL) - OJPH_ERROR(0x030000C1, "Unable to open file %s", filename); + 
OJPH_ERROR(0x03000131, "Unable to open file %s", filename); cur_line = 0; bytes_per_sample = (bit_depth + 7) >> 3; - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; } @@ -1251,7 +1496,7 @@ namespace ojph { if (result != width) { close(); - OJPH_ERROR(0x030000C2, "not enough data in file %s", fname); + OJPH_ERROR(0x03000132, "not enough data in file %s", fname); } if (bytes_per_sample > 3) @@ -1360,7 +1605,7 @@ namespace ojph { assert(fh == NULL); //configure before open fh = fopen(filename, "wb"); if (fh == 0) - OJPH_ERROR(0x03000091, "Unable to open file %s", filename); + OJPH_ERROR(0x03000141, "Unable to open file %s", filename); fname = filename; } @@ -1373,11 +1618,11 @@ namespace ojph { this->width = width; if (is_signed) { - upper_val = (1 << (bit_depth - 1)); - lower_val = -(1 << (bit_depth - 1)); + upper_val = ((si64)1 << (bit_depth - 1)); + lower_val = -((si64)1 << (bit_depth - 1)); } else { - upper_val = 1 << bit_depth; - lower_val = 0; + upper_val = (si64)1 << bit_depth; + lower_val = (si64)0; } bytes_per_sample = (bit_depth + 7) >> 3; @@ -1392,63 +1637,127 @@ namespace ojph { assert(fh); assert(comp_num == 0); - if (bytes_per_sample > 3) + if (is_signed) { - const si32* sp = line->i32; - ui32* dp = (ui32*)buffer; - for (ui32 i = width; i > 0; --i) + if (bytes_per_sample > 3) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp++ = (ui32)val; + const si32* sp = line->i32; + si32* dp = (si32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (si32)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000151, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B1, "unable to write to file %s", fname); - } - else if (bytes_per_sample > 2) - { - const si32* sp = line->i32; - ui32* dp = (ui32*)buffer; - for (ui32 i = width; i > 0; --i) + else if (bytes_per_sample > 2) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp = (ui32)val; - // this only works for little endian architecture - dp = (ui32*)((ui8*)dp + 3); + const si32* sp = line->i32; + si32* dp = (si32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp = (si32)val; + // this only works for little endian architecture + dp = (si32*)((ui8*)dp + 3); + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000152, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B2, "unable to write to file %s", fname); - } - else if (bytes_per_sample > 1) - { - const si32* sp = line->i32; - ui16* dp = (ui16*)buffer; - for (ui32 i = width; i > 0; --i) + else if (bytes_per_sample > 1) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp++ = (ui16)val; + const si32* sp = line->i32; + si16* dp = (si16*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? 
val : lower_val; + *dp++ = (si16)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000153, "unable to write to file %s", fname); + } + else + { + const si32* sp = line->i32; + si8* dp = (si8*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (si8)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000154, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B3, "unable to write to file %s", fname); } - else + else { - const si32* sp = line->i32; - ui8* dp = (ui8*)buffer; - for (ui32 i = width; i > 0; --i) + if (bytes_per_sample > 3) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp++ = (ui8)val; + const ui32* sp = (ui32*)line->i32; + ui32* dp = (ui32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (ui32)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000155, "unable to write to file %s", fname); + } + else if (bytes_per_sample > 2) + { + const ui32* sp = (ui32*)line->i32; + ui32* dp = (ui32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp = (ui32)val; + // this only works for little endian architecture + dp = (ui32*)((ui8*)dp + 3); + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000156, "unable to write to file %s", fname); + } + else if (bytes_per_sample > 1) + { + const ui32* sp = (ui32*)line->i32; + ui16* dp = (ui16*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (ui16)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000157, "unable to write to file %s", fname); + } + else + { + const ui32* sp = (ui32*)line->i32; + ui8* dp = (ui8*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (ui8)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000158, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B4, "unable to write to file %s", fname); } return width; @@ -1470,7 +1779,7 @@ namespace ojph { assert(file_handle == 0); file_handle = fopen(filename, "rb"); if (0 == file_handle) - OJPH_ERROR(0x0300000D1, "Unable to open file %s", filename); + OJPH_ERROR(0x03000161, "Unable to open file %s", filename); fname = filename; // read magic number @@ -1478,7 +1787,7 @@ namespace ojph { if (fread(&magic_number, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000D2, "Error reading file %s", filename); + OJPH_ERROR(0x03000162, "Error reading file %s", filename); } // check magic number @@ -1497,7 +1806,7 @@ namespace ojph { else { close(); - OJPH_ERROR(0x0300000D3, "Error reading file %s - this does not appear " + OJPH_ERROR(0x03000163, "Error reading file %s - this does not appear " "to be a valid DPX file. It has magic number = 0x%08X. 
The magic " "number of a DPX file is 0x%08X.", filename, magic_number, dpx_magic_number); @@ -1508,7 +1817,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000D4, "Error reading file %s", filename); + OJPH_ERROR(0x03000164, "Error reading file %s", filename); } if (is_byte_swapping_necessary) offset_to_image_data_in_bytes = be2le(offset_to_image_data_in_bytes); @@ -1516,14 +1825,14 @@ namespace ojph { if (fread(version, sizeof(uint8_t), 8, file_handle) != 8) { close(); - OJPH_ERROR(0x0300000D5, "Error reading file %s", filename); + OJPH_ERROR(0x03000165, "Error reading file %s", filename); } // read image file size in bytes if (fread(&total_image_file_size_in_bytes, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000D6, "Error reading file %s", filename); + OJPH_ERROR(0x03000166, "Error reading file %s", filename); } if (is_byte_swapping_necessary) total_image_file_size_in_bytes = be2le(total_image_file_size_in_bytes); @@ -1532,14 +1841,14 @@ namespace ojph { if (fseek(file_handle,768, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000D7, "Error reading file %s", filename); + OJPH_ERROR(0x03000167, "Error reading file %s", filename); } // read image_orientation if (fread(&image_orientation, sizeof(uint16_t), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000D8, "Error reading file %s", filename); + OJPH_ERROR(0x03000168, "Error reading file %s", filename); } if (is_byte_swapping_necessary) image_orientation = be2le(image_orientation); @@ -1549,7 +1858,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000D9, "Error reading file %s", filename); + OJPH_ERROR(0x03000169, "Error reading file %s", filename); } if (is_byte_swapping_necessary) number_of_image_elements = be2le(number_of_image_elements); @@ -1558,7 +1867,7 @@ namespace ojph { if (fread(&pixels_per_line, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000DA, "Error reading file %s", filename); + OJPH_ERROR(0x0300016A, "Error reading file %s", filename); } if (is_byte_swapping_necessary) pixels_per_line = be2le(pixels_per_line); @@ -1567,7 +1876,7 @@ namespace ojph { if (fread(&lines_per_image_element, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000DB, "Error reading file %s", filename); + OJPH_ERROR(0x0300016B, "Error reading file %s", filename); } if (is_byte_swapping_necessary) lines_per_image_element = be2le(lines_per_image_element); @@ -1576,7 +1885,7 @@ namespace ojph { if (fseek(file_handle, 780, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000DC, "Error reading file %s", filename); + OJPH_ERROR(0x0300016C, "Error reading file %s", filename); } // read data sign for image element @@ -1584,7 +1893,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000DE, "Error reading file %s", filename); + OJPH_ERROR(0x0300016E, "Error reading file %s", filename); } if (is_byte_swapping_necessary) data_sign_for_image_element_1 = be2le(data_sign_for_image_element_1); @@ -1593,7 +1902,7 @@ namespace ojph { if (fseek(file_handle, 800, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000DF, "Error reading file %s", filename); + OJPH_ERROR(0x0300016F, "Error reading file %s", filename); } // read descriptor @@ -1601,7 +1910,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E0, "Error reading file %s", filename); + OJPH_ERROR(0x03000170, "Error reading file %s", filename); } // read transfer characteristic @@ -1609,7 +1918,7 @@ namespace ojph { 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000E1, "Error reading file %s", filename); + 
OJPH_ERROR(0x03000171, "Error reading file %s", filename); } // read colorimetric specification @@ -1617,7 +1926,7 @@ namespace ojph { 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000E2, "Error reading file %s", filename); + OJPH_ERROR(0x03000172, "Error reading file %s", filename); } // read bit depth @@ -1625,7 +1934,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E3, "Error reading file %s", filename); + OJPH_ERROR(0x03000173, "Error reading file %s", filename); } // read packing @@ -1633,7 +1942,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E4, "Error reading file %s", filename); + OJPH_ERROR(0x03000174, "Error reading file %s", filename); } if (is_byte_swapping_necessary) packing_for_image_element_1 = be2le(packing_for_image_element_1); @@ -1643,7 +1952,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E5, "Error reading file %s", filename); + OJPH_ERROR(0x03000175, "Error reading file %s", filename); } if (is_byte_swapping_necessary) encoding_for_image_element_1 = be2le(encoding_for_image_element_1); @@ -1653,7 +1962,7 @@ namespace ojph { file_handle) != 1) { close(); - OJPH_ERROR(0x0300000E6, "Error reading file %s", filename); + OJPH_ERROR(0x03000176, "Error reading file %s", filename); } if (is_byte_swapping_necessary) offset_to_data_for_image_element_1 = @@ -1663,7 +1972,7 @@ namespace ojph { if (fseek(file_handle, (long)offset_to_image_data_in_bytes, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000E7, "Error reading file %s", filename); + OJPH_ERROR(0x03000177, "Error reading file %s", filename); } // set ojph properties @@ -1689,17 +1998,17 @@ namespace ojph { // allocate linebuffer to hold a line of image data from the file line_buffer = malloc(number_of_32_bit_words_per_line * sizeof(ui32) ); if (NULL == line_buffer) - OJPH_ERROR(0x0300000E8, "Unable to allocate %d bytes for line_buffer[] " + OJPH_ERROR(0x03000178, "Unable to allocate %d bytes for line_buffer[] " "for file %s", number_of_32_bit_words_per_line * sizeof(ui32), filename); // allocate line_buffer_16bit_samples to hold a line of image data in memory line_buffer_16bit_samples = - (ui16*) malloc(width * num_comps * sizeof(ui16)); + (ui16*) malloc((size_t)width * num_comps * sizeof(ui16)); if (NULL == line_buffer_16bit_samples) - OJPH_ERROR(0x0300000E9, "Unable to allocate %d bytes for " + OJPH_ERROR(0x03000179, "Unable to allocate %d bytes for " "line_buffer_16bit_samples[] for file %s", - width * num_comps * sizeof(ui16), filename); + (size_t)width * num_comps * sizeof(ui16), filename); cur_line = 0; @@ -1719,7 +2028,7 @@ namespace ojph { file_handle) != number_of_32_bit_words_per_line) { close(); - OJPH_ERROR(0x0300000F1, "Error reading file %s", fname); + OJPH_ERROR(0x03000181, "Error reading file %s", fname); } if (true == is_byte_swapping_necessary) @@ -1773,7 +2082,7 @@ namespace ojph { } else { - OJPH_ERROR(0x0300000F2, "file %s uses DPX image formats that are not " + OJPH_ERROR(0x03000182, "file %s uses DPX image formats that are not " "yet supported by this software\n bitdepth_for_image_element_1 = " "%d\n num_comps=%d\npacking_for_image_element_1=%d\n " "descriptor_for_image_element_1=%d", fname, diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 555de0e..1c6856a 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -10,6 +10,7 @@ file(GLOB CODESTREAM_WASM "codestream/*_wasm.cpp") file(GLOB CODING "coding/*.cpp" "coding/*.h") file(GLOB CODING_SSSE3 "coding/*_ssse3.cpp") file(GLOB CODING_WASM "coding/*_wasm.cpp") +file(GLOB 
CODING_AVX2 "coding/*_avx2.cpp") file(GLOB CODING_AVX512 "coding/*_avx512.cpp") file(GLOB COMMON "common/*.h") file(GLOB OTHERS "others/*.cpp") @@ -22,7 +23,7 @@ file(GLOB TRANSFORM_AVX512 "transform/*_avx512.cpp") file(GLOB TRANSFORM_WASM "transform/*_wasm.cpp") list(REMOVE_ITEM CODESTREAM ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODESTREAM_WASM}) -list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX512}) +list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX2} ${CODING_AVX512}) list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512} ${TRANSFORM_WASM}) list(APPEND SOURCES ${CODESTREAM} ${CODING} ${COMMON} ${OTHERS} ${TRANSFORM}) @@ -70,9 +71,10 @@ else() source_group("transform" FILES ${TRANSFORM_AVX}) endif() if (NOT OJPH_DISABLE_AVX2) - list(APPEND SOURCES ${CODESTREAM_AVX2} ${TRANSFORM_AVX2}) + list(APPEND SOURCES ${CODESTREAM_AVX2} ${TRANSFORM_AVX2} ${CODING_AVX2}) source_group("codestream" FILES ${CODESTREAM_AVX2}) source_group("transform" FILES ${TRANSFORM_AVX2}) + source_group("coding" FILES ${CODING_AVX2}) endif() if ((NOT OJPH_DISABLE_AVX512) AND ("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64")) list(APPEND SOURCES ${CODING_AVX512} ${TRANSFORM_AVX512}) @@ -84,6 +86,8 @@ else() if (MSVC) set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX") set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") + set_source_files_properties(coding/ojph_block_decoder_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") + set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512") set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX") set_source_files_properties(transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") @@ -94,6 +98,8 @@ else() set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS -mavx) set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) set_source_files_properties(coding/ojph_block_decoder_ssse3.cpp PROPERTIES COMPILE_FLAGS -mssse3) + set_source_files_properties(coding/ojph_block_decoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) + set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx512cd) set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS -mavx) set_source_files_properties(transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) diff --git a/src/core/codestream/ojph_bitbuffer_write.h b/src/core/codestream/ojph_bitbuffer_write.h index d5b6bca..ecb9dd2 100644 --- a/src/core/codestream/ojph_bitbuffer_write.h +++ b/src/core/codestream/ojph_bitbuffer_write.h @@ -109,33 +109,25 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline + void bb_put_zeros(bit_write_buf *bbp, int num_zeros, + mem_elastic_allocator *elastic, + coded_lists*& cur_coded_list, ui32& ph_bytes) + { + for (int i = num_zeros; i > 0; --i) + bb_put_bit(bbp, 0, elastic, cur_coded_list, ph_bytes); + } + ////////////////////////////////////////////////////////////////////////// static inline void 
bb_put_bits(bit_write_buf *bbp, ui32 data, int num_bits, mem_elastic_allocator *elastic, coded_lists*& cur_coded_list, ui32& ph_bytes) { -// assert(num_bits <= 32); - for (int i = num_bits - 1; i >= 0; --i) + assert(num_bits <= 32); + for (int i = num_bits - 1; i >= 0; --i) bb_put_bit(bbp, data >> i, elastic, cur_coded_list, ph_bytes); -// while (num_bits) { -// int tx_bits = num_bits < bbp->avail_bits ? num_bits : bbp->avail_bits; -// bbp->tmp |= (data >> (num_bits - tx_bits)) & ((1 << tx_bits) - 1); -// bbp->avail_bits -= tx_bits; -// if (bbp->avail_bits <= 0) -// { -// bbp->avail_bits = 8 - (bbp->tmp != 0xFF ? 0 : 1); -// bbp->buf[bbp->buf_size - bbp->avail_size] = (ui8)(bbp->tmp & 0xFF); -// bbp->tmp = 0; -// --bbp->avail_size; -// if (bbp->avail_size == 0) -// { -// bb_expand_buf(bbp, elastic, cur_coded_list->next_list); -// cur_coded_list = cur_coded_list->next_list; -// ph_bytes += bit_buffer::needed; -// } -// } -// } } ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp index 9a63ca1..351284b 100644 --- a/src/core/codestream/ojph_codeblock.cpp +++ b/src/core/codestream/ojph_codeblock.cpp @@ -45,6 +45,7 @@ #include "ojph_codestream_local.h" #include "ojph_codeblock.h" #include "ojph_subband.h" +#include "ojph_resolution.h" namespace ojph { @@ -52,7 +53,7 @@ { ////////////////////////////////////////////////////////////////////////// - void codeblock::pre_alloc(codestream *codestream, + void codeblock::pre_alloc(codestream *codestream, ui32 comp_num, const size& nominal) { mem_fixed_allocator* allocator = codestream->get_allocator(); @@ -60,7 +61,14 @@ assert(byte_alignment / sizeof(ui32) > 1); const ui32 f = byte_alignment / sizeof(ui32) - 1; ui32 stride = (nominal.w + f) & ~f; // a multiple of 8 - allocator->pre_alloc_data<ui32>(nominal.h * stride, 0); + + const param_siz* sz = codestream->get_siz(); + const param_cod* cd = codestream->get_cod(comp_num); + ui32 precision = cd->propose_implementation_precision(sz); + if (precision <= 32) + allocator->pre_alloc_data<ui32>(nominal.h * (size_t)stride, 0); + else + allocator->pre_alloc_data<ui64>(nominal.h * (size_t)stride, 0); } ////////////////////////////////////////////////////////////////////////// @@ -75,7 +83,19 @@ const ui32 f = byte_alignment / sizeof(ui32) - 1; this->stride = (nominal.w + f) & ~f; // a multiple of 8 this->buf_size = this->stride * nominal.h; - this->buf = allocator->post_alloc_data<ui32>(this->buf_size, 0); + + ui32 comp_num = parent->get_parent()->get_comp_num(); + const param_siz* sz = codestream->get_siz(); + const param_cod* cd = codestream->get_cod(comp_num); + ui32 bit_depth = cd->propose_implementation_precision(sz); + if (bit_depth <= 32) { + precision = BUF32; + this->buf32 = allocator->post_alloc_data<ui32>(this->buf_size, 0); + } + else { + precision = BUF64; + this->buf64 = allocator->post_alloc_data<ui64>(this->buf_size, 0); + } this->nominal_size = nominal; this->cb_size = cb_size; @@ -85,8 +105,8 @@ this->delta = parent->get_delta(); this->delta_inv = 1.0f / this->delta; this->K_max = K_max; - for (int i = 0; i < 8; ++i) - this->max_val[i] = 0; + for (int i = 0; i < 4; ++i) + this->max_val64[i] = 0; ojph::param_cod cod = codestream->access_cod(); this->reversible = cod.is_reversible(); this->resilient = codestream->is_resilient(); @@ -100,28 +120,61 @@ ////////////////////////////////////////////////////////////////////////// void 
codeblock::push(line_buf *line) { - // convert to sign and magnitude and keep max_val - const si32 *sp = line->i32 + line_offset; - ui32 *dp = buf + cur_line * stride; - this->codeblock_functions.tx_to_cb(sp, dp, K_max, delta_inv, cb_size.w, - max_val); - ++cur_line; + // convert to sign and magnitude and keep max_val + if (precision == BUF32) + { + assert(line->flags & line_buf::LFT_32BIT); + const si32 *sp = line->i32 + line_offset; + ui32 *dp = buf32 + cur_line * stride; + this->codeblock_functions.tx_to_cb32(sp, dp, K_max, delta_inv, + cb_size.w, max_val32); + ++cur_line; + } + else + { + assert(precision == BUF64); + assert(line->flags & line_buf::LFT_64BIT); + const si64 *sp = line->i64 + line_offset; + ui64 *dp = buf64 + cur_line * stride; + this->codeblock_functions.tx_to_cb64(sp, dp, K_max, delta_inv, + cb_size.w, max_val64); + ++cur_line; + } } ////////////////////////////////////////////////////////////////////////// void codeblock::encode(mem_elastic_allocator *elastic) { - ui32 mv = this->codeblock_functions.find_max_val(max_val); - if (mv >= 1u<<(31 - K_max)) + if (precision == BUF32) { - coded_cb->missing_msbs = K_max - 1; - assert(coded_cb->missing_msbs > 0); - assert(coded_cb->missing_msbs < K_max); - coded_cb->num_passes = 1; - - this->codeblock_functions.encode_cb(buf, K_max-1, 1, - cb_size.w, cb_size.h, stride, coded_cb->pass_length, - elastic, coded_cb->next_coded); + ui32 mv = this->codeblock_functions.find_max_val32(max_val32); + if (mv >= 1u << (31 - K_max)) + { + coded_cb->missing_msbs = K_max - 1; + assert(coded_cb->missing_msbs > 0); + assert(coded_cb->missing_msbs < K_max); + coded_cb->num_passes = 1; + + this->codeblock_functions.encode_cb32(buf32, K_max-1, 1, + cb_size.w, cb_size.h, stride, coded_cb->pass_length, + elastic, coded_cb->next_coded); + } + } + else + { + assert(precision == BUF64); + ui64 mv = this->codeblock_functions.find_max_val64(max_val64); + if (mv >= 1ULL << (63 - K_max)) + { + coded_cb->missing_msbs = K_max - 1; + assert(coded_cb->missing_msbs > 0); + assert(coded_cb->missing_msbs < K_max); + coded_cb->num_passes = 1; + + this->codeblock_functions.encode_cb64(buf64, K_max-1, 1, + cb_size.w, cb_size.h, stride, coded_cb->pass_length, + elastic, coded_cb->next_coded); + } } } @@ -132,8 +185,8 @@ namespace ojph { this->cb_size = cb_size; this->coded_cb = coded_cb; this->cur_line = 0; - for (int i = 0; i < 8; ++i) - this->max_val[i] = 0; + for (int i = 0; i < 4; ++i) + this->max_val64[i] = 0; this->zero_block = false; } @@ -143,20 +196,33 @@ namespace ojph { if (coded_cb->pass_length[0] > 0 && coded_cb->num_passes > 0 && coded_cb->next_coded != NULL) { - bool result = this->codeblock_functions.decode_cb( + bool result; + if (precision == BUF32) + { + result = this->codeblock_functions.decode_cb32( + coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size, + buf32, coded_cb->missing_msbs, coded_cb->num_passes, + coded_cb->pass_length[0], coded_cb->pass_length[1], + cb_size.w, cb_size.h, stride, stripe_causal); + } + else + { + assert(precision == BUF64); + result = this->codeblock_functions.decode_cb64( coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size, - buf, coded_cb->missing_msbs, coded_cb->num_passes, + buf64, coded_cb->missing_msbs, coded_cb->num_passes, coded_cb->pass_length[0], coded_cb->pass_length[1], cb_size.w, cb_size.h, stride, stripe_causal); + } if (result == false) { if (resilient == true) { - OJPH_INFO(0x000300A1, "Error decoding a codeblock"); + OJPH_INFO(0x000300A1, "Error decoding a codeblock."); zero_block = 
true; } else - OJPH_ERROR(0x000300A1, "Error decoding a codeblock"); + OJPH_ERROR(0x000300A1, "Error decoding a codeblock."); } } else @@ -167,15 +233,35 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void codeblock::pull_line(line_buf *line) { - si32 *dp = line->i32 + line_offset; - if (!zero_block) + //convert to sign and magnitude + if (precision == BUF32) { - //convert to sign and magnitude - const ui32 *sp = buf + cur_line * stride; - this->codeblock_functions.tx_from_cb(sp, dp, K_max, delta, cb_size.w); + assert(line->flags & line_buf::LFT_32BIT); + si32 *dp = line->i32 + line_offset; + if (!zero_block) + { + const ui32 *sp = buf32 + cur_line * stride; + this->codeblock_functions.tx_from_cb32(sp, dp, K_max, delta, + cb_size.w); + } + else + this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(ui32)); } else - this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp)); + { + assert(precision == BUF64); + assert(line->flags & line_buf::LFT_64BIT); + si64 *dp = line->i64 + line_offset; + if (!zero_block) + { + const ui64 *sp = buf64 + cur_line * stride; + this->codeblock_functions.tx_from_cb64(sp, dp, K_max, delta, + cb_size.w); + } + else + this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp)); + } + ++cur_line; assert(cur_line <= cb_size.h); } diff --git a/src/core/codestream/ojph_codeblock.h b/src/core/codestream/ojph_codeblock.h index 2f7d8e7..fde8e6a 100644 --- a/src/core/codestream/ojph_codeblock.h +++ b/src/core/codestream/ojph_codeblock.h @@ -48,7 +48,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class mem_elastic_allocator; class codestream; struct coded_lists; @@ -65,8 +65,14 @@ namespace ojph { class codeblock { friend struct precinct; + enum : ui32 { + BUF32 = 4, + BUF64 = 8, + }; + public: - static void pre_alloc(codestream *codestream, const size& nominal); + static void pre_alloc(codestream *codestream, ui32 comp_num, + const size& nominal); void finalize_alloc(codestream *codestream, subband* parent, const size& nominal, const size& cb_size, coded_cb_header* coded_cb, @@ -79,7 +85,11 @@ namespace ojph { void pull_line(line_buf *line); private: - ui32* buf; + ui32 precision; + union { + ui32* buf32; + ui64* buf64; + }; size nominal_size; size cb_size; ui32 stride; @@ -93,7 +103,10 @@ namespace ojph { bool resilient; bool stripe_causal; bool zero_block; // true when the decoded block is all zero - ui32 max_val[8]; // supports up to 256 bits + union { + ui32 max_val32[8]; // supports up to 256 bits + ui64 max_val64[4]; // supports up to 256 bits + }; coded_cb_header* coded_cb; codeblock_fun codeblock_functions; }; diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index cf51530..08d8d73 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -63,72 +63,107 @@ namespace ojph { void wasm_mem_clear(void* addr, size_t count); ////////////////////////////////////////////////////////////////////////// - ui32 gen_find_max_val(ui32* address); - ui32 sse2_find_max_val(ui32* address); - ui32 avx2_find_max_val(ui32* address); - ui32 wasm_find_max_val(ui32* address); + ui32 gen_find_max_val32(ui32* address); + ui32 sse2_find_max_val32(ui32* address); + ui32 avx2_find_max_val32(ui32* address); + ui32 wasm_find_max_val32(ui32* address); + ui64 gen_find_max_val64(ui64* address); + ui64 sse2_find_max_val64(ui64* 
address); + ui64 avx2_find_max_val64(ui64* address); + ui64 wasm_find_max_val64(ui64* address); + ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); + void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + + void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); + void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 
K_max, + float delta, ui32 count); + void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); void codeblock_fun::init(bool reversible) { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) // Default path, no acceleration. We may change this later - decode_cb = ojph_decode_codeblock; - find_max_val = gen_find_max_val; + decode_cb32 = ojph_decode_codeblock32; + find_max_val32 = gen_find_max_val32; mem_clear = gen_mem_clear; if (reversible) { - tx_to_cb = gen_rev_tx_to_cb; - tx_from_cb = gen_rev_tx_from_cb; + tx_to_cb32 = gen_rev_tx_to_cb32; + tx_from_cb32 = gen_rev_tx_from_cb32; } else { - tx_to_cb = gen_irv_tx_to_cb; - tx_from_cb = gen_irv_tx_from_cb; + tx_to_cb32 = gen_irv_tx_to_cb32; + tx_from_cb32 = gen_irv_tx_from_cb32; } - encode_cb = ojph_encode_codeblock; + encode_cb32 = ojph_encode_codeblock32; + + decode_cb64 = ojph_decode_codeblock64; + find_max_val64 = gen_find_max_val64; + if (reversible) { + tx_to_cb64 = gen_rev_tx_to_cb64; + tx_from_cb64 = gen_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; + } + encode_cb64 = ojph_encode_codeblock64; #ifndef OJPH_DISABLE_SIMD #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) - // Accelerated functions for INTEL/AMD CPUs + // Accelerated functions for INTEL/AMD CPUs #ifndef OJPH_DISABLE_SSE if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE) mem_clear = sse_mem_clear; @@ -136,21 +171,31 @@ namespace ojph { #ifndef OJPH_DISABLE_SSE2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) { - find_max_val = sse2_find_max_val; + find_max_val32 = sse2_find_max_val32; if (reversible) { - tx_to_cb = sse2_rev_tx_to_cb; - tx_from_cb = sse2_rev_tx_from_cb; + tx_to_cb32 = sse2_rev_tx_to_cb32; + tx_from_cb32 = sse2_rev_tx_from_cb32; } else { - tx_to_cb = sse2_irv_tx_to_cb; - tx_from_cb = sse2_irv_tx_from_cb; + tx_to_cb32 = sse2_irv_tx_to_cb32; + tx_from_cb32 = sse2_irv_tx_from_cb32; + } + find_max_val64 = sse2_find_max_val64; + if (reversible) { + tx_to_cb64 = sse2_rev_tx_to_cb64; + tx_from_cb64 = sse2_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; } } #endif // !OJPH_DISABLE_SSE2 #ifndef OJPH_DISABLE_SSSE3 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSSE3) - decode_cb = ojph_decode_codeblock_ssse3; + decode_cb32 = ojph_decode_codeblock_ssse3; #endif // !OJPH_DISABLE_SSSE3 #ifndef OJPH_DISABLE_AVX @@ -160,21 +205,39 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - find_max_val = avx2_find_max_val; + decode_cb32 = ojph_decode_codeblock_avx2; + find_max_val32 = avx2_find_max_val32; if (reversible) { - tx_to_cb = avx2_rev_tx_to_cb; - tx_from_cb = avx2_rev_tx_from_cb; + tx_to_cb32 = avx2_rev_tx_to_cb32; + tx_from_cb32 = avx2_rev_tx_from_cb32; } else { - tx_to_cb = avx2_irv_tx_to_cb; - tx_from_cb = avx2_irv_tx_from_cb; + tx_to_cb32 = 
avx2_irv_tx_to_cb32; + tx_from_cb32 = avx2_irv_tx_from_cb32; + } + encode_cb32 = ojph_encode_codeblock_avx2; + bool result = initialize_block_encoder_tables_avx2(); + assert(result); ojph_unused(result); + + find_max_val64 = avx2_find_max_val64; + if (reversible) { + tx_to_cb64 = avx2_rev_tx_to_cb64; + tx_from_cb64 = avx2_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; } } #endif // !OJPH_DISABLE_AVX2 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) - encode_cb = ojph_encode_codeblock_avx512; + if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { + encode_cb32 = ojph_encode_codeblock_avx512; + bool result = initialize_block_encoder_tables_avx512(); + assert(result); ojph_unused(result); + } #endif // !OJPH_DISABLE_AVX512 #elif defined(OJPH_ARCH_ARM) @@ -186,18 +249,31 @@ namespace ojph { #else // OJPH_ENABLE_WASM_SIMD // Accelerated functions for WASM SIMD. - decode_cb = ojph_decode_codeblock_wasm; - find_max_val = wasm_find_max_val; + decode_cb32 = ojph_decode_codeblock_wasm; + find_max_val32 = wasm_find_max_val32; mem_clear = wasm_mem_clear; if (reversible) { - tx_to_cb = wasm_rev_tx_to_cb; - tx_from_cb = wasm_rev_tx_from_cb; + tx_to_cb32 = wasm_rev_tx_to_cb32; + tx_from_cb32 = wasm_rev_tx_from_cb32; } else { - tx_to_cb = wasm_irv_tx_to_cb; - tx_from_cb = wasm_irv_tx_from_cb; + tx_to_cb32 = wasm_irv_tx_to_cb32; + tx_from_cb32 = wasm_irv_tx_from_cb32; + } + encode_cb32 = ojph_encode_codeblock32; + + decode_cb64 = ojph_decode_codeblock64; + find_max_val64 = wasm_find_max_val64; + if (reversible) { + tx_to_cb64 = wasm_rev_tx_to_cb64; + tx_from_cb64 = wasm_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; } - encode_cb = ojph_encode_codeblock; + encode_cb64 = ojph_encode_codeblock64; #endif // !OJPH_ENABLE_WASM_SIMD diff --git a/src/core/codestream/ojph_codeblock_fun.h b/src/core/codestream/ojph_codeblock_fun.h index 679b2d3..67fbc2b 100644 --- a/src/core/codestream/ojph_codeblock_fun.h +++ b/src/core/codestream/ojph_codeblock_fun.h @@ -51,23 +51,40 @@ namespace ojph { typedef void (*mem_clear_fun)(void* addr, size_t count); // define function signature for max value finding - typedef ui32 (*find_max_val_fun)(ui32* addr); + typedef ui32 (*find_max_val_fun32)(ui32* addr); + + typedef ui64 (*find_max_val_fun64)(ui64* addr); // define line transfer function signature from subbands to codeblocks - typedef void (*tx_to_cb_fun)(const void *sp, ui32 *dp, ui32 K_max, + typedef void (*tx_to_cb_fun32)(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32* max_val); + typedef void (*tx_to_cb_fun64)(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + // define line transfer function signature from codeblock to subband - typedef void (*tx_from_cb_fun)(const ui32 *sp, void *dp, ui32 K_max, + typedef void (*tx_from_cb_fun32)(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + + typedef void (*tx_from_cb_fun64)(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count); // define the block decoder function signature - typedef bool (*cb_decoder_fun)(ui8* coded_data, ui32* decoded_data, + typedef bool (*cb_decoder_fun32)(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + + typedef bool (*cb_decoder_fun64)(ui8* coded_data, ui64* decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 
lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal);
 
     // define the block encoder function signature
-    typedef void (*cb_encoder_fun)(ui32* buf, ui32 missing_msbs,
+    typedef void (*cb_encoder_fun32)(ui32* buf, ui32 missing_msbs,
+      ui32 num_passes, ui32 width, ui32 height, ui32 stride,
+      ui32* lengths, ojph::mem_elastic_allocator* elastic,
+      ojph::coded_lists*& coded);
+
+    typedef void (*cb_encoder_fun64)(ui64* buf, ui32 missing_msbs,
       ui32 num_passes, ui32 width, ui32 height, ui32 stride,
       ui32* lengths, ojph::mem_elastic_allocator* elastic,
       ojph::coded_lists*& coded);
@@ -81,19 +98,24 @@
      mem_clear_fun mem_clear;
 
      // a pointer to the max value finding function
-      find_max_val_fun find_max_val;
+      find_max_val_fun32 find_max_val32;
+      find_max_val_fun64 find_max_val64;
 
      // a pointer to function transferring samples from subbands to codeblocks
-      tx_to_cb_fun tx_to_cb;
+      tx_to_cb_fun32 tx_to_cb32;
+      tx_to_cb_fun64 tx_to_cb64;
 
      // a pointer to function transferring samples from codeblocks to subbands
-      tx_from_cb_fun tx_from_cb;
+      tx_from_cb_fun32 tx_from_cb32;
+      tx_from_cb_fun64 tx_from_cb64;
 
      // a pointer to the decoder function
-      cb_decoder_fun decode_cb;
+      cb_decoder_fun32 decode_cb32;
+      cb_decoder_fun64 decode_cb64;
 
      // a pointer to the encoder function
-      cb_encoder_fun encode_cb;
+      cb_encoder_fun32 encode_cb32;
+      cb_encoder_fun64 encode_cb64;
    };
  }
diff --git a/src/core/codestream/ojph_codestream.cpp b/src/core/codestream/ojph_codestream.cpp
index 06f6b56..f2832ac 100644
--- a/src/core/codestream/ojph_codestream.cpp
+++ b/src/core/codestream/ojph_codestream.cpp
@@ -84,6 +84,12 @@
      return param_qcd(&state->qcd);
    }
 
+    ////////////////////////////////////////////////////////////////////////////
+    param_nlt codestream::access_nlt()
+    {
+      return param_nlt(&state->nlt);
+    }
+
    ////////////////////////////////////////////////////////////////////////////
    void codestream::set_planar(bool planar)
    {
diff --git a/src/core/codestream/ojph_codestream_avx2.cpp b/src/core/codestream/ojph_codestream_avx2.cpp
index 04a81ed..a8e5138 100644
--- a/src/core/codestream/ojph_codestream_avx2.cpp
+++ b/src/core/codestream/ojph_codestream_avx2.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include <climits>
 #include <immintrin.h>
 
 #include "ojph_defs.h"
@@ -42,7 +43,7 @@ namespace ojph {
  namespace local {

    //////////////////////////////////////////////////////////////////////////
-    ui32 avx2_find_max_val(ui32* address)
+    ui32 avx2_find_max_val32(ui32* address)
    {
      __m128i x0 = _mm_loadu_si128((__m128i*)address);
      __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
@@ -56,14 +57,26 @@
    }

    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 avx2_find_max_val64(ui64* address)
+    {
+      __m128i x0 = _mm_loadu_si128((__m128i*)address);
+      __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
+      x0 = _mm_or_si128(x0, x1);
+      x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
+      x0 = _mm_or_si128(x0, x1);
+      ui64 t = (ui64)_mm_extract_epi64(x0, 0);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
    {
      ojph_unused(delta_inv);

      // convert to sign and magnitude and keep max_val
      ui32 shift = 31 - K_max;
-      __m256i m0 =
_mm256_set1_epi32((int)0x80000000); + __m256i m0 = _mm256_set1_epi32(INT_MIN); __m256i tmax = _mm256_loadu_si256((__m256i*)max_val); __m256i *p = (__m256i*)sp; for (ui32 i = 0; i < count; i += 8, p += 1, dp += 8) @@ -78,16 +91,16 @@ namespace ojph { } _mm256_storeu_si256((__m256i*)max_val, tmax); } - + ////////////////////////////////////////////////////////////////////////// - void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(K_max); //quantize and convert to sign and magnitude and keep max_val __m256 d = _mm256_set1_ps(delta_inv); - __m256i m0 = _mm256_set1_epi32((int)0x80000000); + __m256i m0 = _mm256_set1_epi32(INT_MIN); __m256i tmax = _mm256_loadu_si256((__m256i*)max_val); float *p = (float*)sp; @@ -106,29 +119,29 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; - __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF); + __m256i m1 = _mm256_set1_epi32(INT_MAX); si32 *p = (si32*)dp; for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8) { - __m256i v = _mm256_load_si256((__m256i*)sp); - __m256i val = _mm256_and_si256(v, m1); - val = _mm256_srli_epi32(val, (int)shift); - val = _mm256_sign_epi32(val, v); - _mm256_storeu_si256((__m256i*)p, val); + __m256i v = _mm256_load_si256((__m256i*)sp); + __m256i val = _mm256_and_si256(v, m1); + val = _mm256_srli_epi32(val, (int)shift); + val = _mm256_sign_epi32(val, v); + _mm256_storeu_si256((__m256i*)p, val); } } ////////////////////////////////////////////////////////////////////////// - void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); - __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF); + __m256i m1 = _mm256_set1_epi32(INT_MAX); __m256 d = _mm256_set1_ps(delta); float *p = (float*)dp; for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8) @@ -142,5 +155,58 @@ namespace ojph { _mm256_storeu_ps(p, valf); } } + + ////////////////////////////////////////////////////////////////////////// + void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 63 - K_max; + __m256i m0 = _mm256_set1_epi64x(LLONG_MIN); + __m256i zero = _mm256_setzero_si256(); + __m256i one = _mm256_set1_epi64x(1); + __m256i tmax = _mm256_loadu_si256((__m256i*)max_val); + __m256i *p = (__m256i*)sp; + for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4) + { + __m256i v = _mm256_loadu_si256(p); + __m256i sign = _mm256_cmpgt_epi64(zero, v); + __m256i val = _mm256_xor_si256(v, sign); // negate 1's complement + __m256i ones = _mm256_and_si256(sign, one); + val = _mm256_add_epi64(val, ones); // 2's complement + sign = _mm256_and_si256(sign, m0); + val = _mm256_slli_epi64(val, (int)shift); + tmax = _mm256_or_si256(tmax, val); + val = _mm256_or_si256(val, sign); + _mm256_storeu_si256((__m256i*)dp, val); + } + _mm256_storeu_si256((__m256i*)max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void 
avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + + ui32 shift = 63 - K_max; + __m256i m1 = _mm256_set1_epi64x(LLONG_MAX); + __m256i zero = _mm256_setzero_si256(); + __m256i one = _mm256_set1_epi64x(1); + si64 *p = (si64*)dp; + for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) + { + __m256i v = _mm256_load_si256((__m256i*)sp); + __m256i val = _mm256_and_si256(v, m1); + val = _mm256_srli_epi64(val, (int)shift); + __m256i sign = _mm256_cmpgt_epi64(zero, v); + val = _mm256_xor_si256(val, sign); // negate 1's complement + __m256i ones = _mm256_and_si256(sign, one); + val = _mm256_add_epi64(val, ones); // 2's complement + _mm256_storeu_si256((__m256i*)p, val); + } + } } -} \ No newline at end of file +} diff --git a/src/core/codestream/ojph_codestream_gen.cpp b/src/core/codestream/ojph_codestream_gen.cpp index 466f483..cdc72c6 100644 --- a/src/core/codestream/ojph_codestream_gen.cpp +++ b/src/core/codestream/ojph_codestream_gen.cpp @@ -44,18 +44,21 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void gen_mem_clear(void* addr, size_t count) { - ui32* p = (ui32*)addr; - for (size_t i = 0; i < count; i += 4, p += 1) - *p = 0; + si64* p = (si64*)addr; + for (size_t i = 0; i < count; i += 8) + *p++ = 0; } ////////////////////////////////////////////////////////////////////////// - ui32 gen_find_max_val(ui32* addr) { return addr[0]; } + ui32 gen_find_max_val32(ui32* addr) { return addr[0]; } ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, - ui32* max_val) + ui64 gen_find_max_val64(ui64* addr) { return addr[0]; } + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, + ui32* max_val) { ojph_unused(delta_inv); ui32 shift = 31 - K_max; @@ -65,7 +68,7 @@ namespace ojph { for (ui32 i = count; i > 0; --i) { si32 v = *p++; - ui32 sign = v >= 0 ? 0 : 0x80000000; + ui32 sign = v >= 0 ? 0U : 0x80000000U; ui32 val = (ui32)(v >= 0 ? v : -v); val <<= shift; *dp++ = sign | val; @@ -75,9 +78,31 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, - ui32* max_val) + void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, + ui64* max_val) + { + ojph_unused(delta_inv); + ui32 shift = 63 - K_max; + // convert to sign and magnitude and keep max_val + ui64 tmax = *max_val; + si64 *p = (si64*)sp; + for (ui32 i = count; i > 0; --i) + { + si64 v = *p++; + ui64 sign = v >= 0 ? 0ULL : 0x8000000000000000ULL; + ui64 val = (ui64)(v >= 0 ? v : -v); + val <<= shift; + *dp++ = sign | val; + tmax |= val; // it is more efficient to use or than max + } + *max_val = tmax; + } + + ////////////////////////////////////////////////////////////////////////// + void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, + ui32* max_val) { ojph_unused(K_max); //quantize and convert to sign and magnitude and keep max_val @@ -87,7 +112,7 @@ namespace ojph { { float v = *p++; si32 t = ojph_trunc(v * delta_inv); - ui32 sign = t >= 0 ? 0 : 0x80000000; + ui32 sign = t >= 0 ? 0U : 0x80000000U; ui32 val = (ui32)(t >= 0 ? 
t : -t); *dp++ = sign | val; tmax |= val; // it is more efficient to use or than max @@ -96,8 +121,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; @@ -106,14 +131,30 @@ namespace ojph { for (ui32 i = count; i > 0; --i) { ui32 v = *sp++; - si32 val = (v & 0x7FFFFFFF) >> shift; - *p++ = (v & 0x80000000) ? -val : val; + si32 val = (v & 0x7FFFFFFFU) >> shift; + *p++ = (v & 0x80000000U) ? -val : val; + } + } + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 63 - K_max; + //convert to sign and magnitude + si64 *p = (si64*)dp; + for (ui32 i = count; i > 0; --i) + { + ui64 v = *sp++; + si64 val = (v & 0x7FFFFFFFFFFFFFFFULL) >> shift; + *p++ = (v & 0x8000000000000000ULL) ? -val : val; } } ////////////////////////////////////////////////////////////////////////// - void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); //convert to sign and magnitude @@ -121,8 +162,8 @@ namespace ojph { for (ui32 i = count; i > 0; --i) { ui32 v = *sp++; - float val = (float)(v & 0x7FFFFFFF) * delta; - *p++ = (v & 0x80000000) ? -val : val; + float val = (float)(v & 0x7FFFFFFFU) * delta; + *p++ = (v & 0x80000000U) ? -val : val; } } diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp index 8279466..7a114b7 100644 --- a/src/core/codestream/ojph_codestream_local.cpp +++ b/src/core/codestream/ojph_codestream_local.cpp @@ -550,6 +550,7 @@ namespace ojph { cod.update_atk(atk); qcd.check_validity(siz, cod); cap.check_validity(cod, qcd); + nlt.check_validity(siz); if (profile == OJPH_PN_IMF) check_imf_validity(); else if (profile == OJPH_PN_BROADCAST) @@ -632,6 +633,9 @@ namespace ojph { if (!qcd.write(file)) OJPH_ERROR(0x00030026, "Error writing to file"); + if (!nlt.write(file)) + OJPH_ERROR(0x00030027, "Error writing to file"); + char buf[] = " OpenJPH Ver " OJPH_INT_TO_STRING(OPENJPH_VERSION_MAJOR) "." OJPH_INT_TO_STRING(OPENJPH_VERSION_MINOR) "." 
@@ -642,23 +646,23 @@ namespace ojph { //1 for General use (IS 8859-15:1999 (Latin) values) *(ui16*)(buf + 4) = swap_byte((ui16)(1)); if (file->write(buf, len) != len) - OJPH_ERROR(0x00030027, "Error writing to file"); + OJPH_ERROR(0x00030028, "Error writing to file"); if (comments != NULL) { for (ui32 i = 0; i < num_comments; ++i) { t = swap_byte(JP2K_MARKER::COM); if (file->write(&t, 2) != 2) - OJPH_ERROR(0x00030028, "Error writing to file"); + OJPH_ERROR(0x00030029, "Error writing to file"); t = swap_byte((ui16)(comments[i].len + 4)); if (file->write(&t, 2) != 2) - OJPH_ERROR(0x00030029, "Error writing to file"); + OJPH_ERROR(0x0003002A, "Error writing to file"); //1 for General use (IS 8859-15:1999 (Latin) values) t = swap_byte(comments[i].Rcom); if (file->write(&t, 2) != 2) - OJPH_ERROR(0x0003002A, "Error writing to file"); - if (file->write(comments[i].data, comments[i].len)!=comments[i].len) OJPH_ERROR(0x0003002B, "Error writing to file"); + if (file->write(comments[i].data, comments[i].len)!=comments[i].len) + OJPH_ERROR(0x0003002C, "Error writing to file"); } } } @@ -728,8 +732,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void codestream::read_headers(infile_base *file) { - ui16 marker_list[19] = { SOC, SIZ, CAP, PRF, CPF, COD, COC, QCD, QCC, - RGN, POC, PPM, TLM, PLM, CRG, COM, DFS, ATK, SOT }; + ui16 marker_list[20] = { SOC, SIZ, CAP, PRF, CPF, COD, COC, QCD, QCC, + RGN, POC, PPM, TLM, PLM, CRG, COM, DFS, ATK, NLT, SOT }; find_marker(file, marker_list, 1); //find SOC find_marker(file, marker_list + 1, 1); //find SIZ siz.read(file); @@ -737,7 +741,7 @@ namespace ojph { int received_markers = 0; //check that COD, & QCD received while (true) { - marker_idx = find_marker(file, marker_list + 2, 17); + marker_idx = find_marker(file, marker_list + 2, 18); if (marker_idx == 0) cap.read(file); else if (marker_idx == 1) @@ -813,6 +817,8 @@ namespace ojph { else if (marker_idx == 15) atk[2].read(file); else if (marker_idx == 16) + nlt.read(file); + else if (marker_idx == 17) break; else OJPH_ERROR(0x00030051, "File ended before finding a tile segment"); @@ -902,19 +908,20 @@ namespace ojph { } bool sod_found = false; - ui16 other_tile_part_markers[6] = { SOT, POC, PPT, PLT, COM, SOD }; + ui16 other_tile_part_markers[7] = { SOT, POC, PPT, PLT, COM, + NLT, SOD }; while (true) { int marker_idx = 0; int result = 0; - marker_idx = find_marker(infile, other_tile_part_markers + 1, 5); + marker_idx = find_marker(infile, other_tile_part_markers + 1, 6); if (marker_idx == 0) result = skip_marker(infile, "POC", - "POC in a tile is not supported yet", + "POC marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 1) result = skip_marker(infile, "PPT", - "PPT in a tile is not supported yet", + "PPT marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 2) //Skipping PLT marker segment;this should not cause any issues @@ -924,6 +931,10 @@ namespace ojph { result = skip_marker(infile, "COM", NULL, OJPH_MSG_LEVEL::NO_MSG, resilient); else if (marker_idx == 4) + result = skip_marker(infile, "NLT", + "NLT marker in tile is not supported yet", + OJPH_MSG_LEVEL::WARN, resilient); + else if (marker_idx == 5) { sod_found = true; break; @@ -961,40 +972,40 @@ namespace ojph { else { //first tile part bool sod_found = false; - ui16 first_tile_part_markers[11] = { SOT, COD, COC, QCD, QCC, RGN, - POC, PPT, PLT, COM, SOD }; + ui16 first_tile_part_markers[12] = { SOT, 
COD, COC, QCD, QCC, RGN,
+          POC, PPT, PLT, COM, NLT, SOD };
        while (true)
        {
          int marker_idx = 0;
          int result = 0;
-          marker_idx = find_marker(infile, first_tile_part_markers+1, 10);
+          marker_idx = find_marker(infile, first_tile_part_markers+1, 11);
          if (marker_idx == 0)
            result = skip_marker(infile, "COD",
-              "COD in a tile is not supported yet",
+              "COD marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 1)
            result = skip_marker(infile, "COC",
-              "COC in a tile is not supported yet",
+              "COC marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 2)
            result = skip_marker(infile, "QCD",
-              "QCD in a tile is not supported yet",
+              "QCD marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 3)
            result = skip_marker(infile, "QCC",
-              "QCC in a tile is not supported yet",
+              "QCC marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 4)
            result = skip_marker(infile, "RGN",
-              "RGN in a tile is not supported yet",
+              "RGN marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 5)
            result = skip_marker(infile, "POC",
-              "POC in a tile is not supported yet",
+              "POC marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 6)
            result = skip_marker(infile, "PPT",
-              "PPT in a tile is not supported yet",
+              "PPT marker segment in a tile is not supported yet",
              OJPH_MSG_LEVEL::WARN, resilient);
          else if (marker_idx == 7)
            //Skipping PLT marker segment; this should not cause any issues
@@ -1004,6 +1015,10 @@
            result = skip_marker(infile, "COM", NULL,
              OJPH_MSG_LEVEL::NO_MSG, resilient);
          else if (marker_idx == 9)
+            result = skip_marker(infile, "NLT",
+              "NLT marker segment in a tile is not supported yet",
+              OJPH_MSG_LEVEL::WARN, resilient);
+          else if (marker_idx == 10)
          {
            sod_found = true;
            break;
diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h
index 8ca8c71..3d03658 100644
--- a/src/core/codestream/ojph_codestream_local.h
+++ b/src/core/codestream/ojph_codestream_local.h
@@ -46,7 +46,7 @@ namespace ojph {
 
  ////////////////////////////////////////////////////////////////////////////
  //defined elsewhere
-  struct line_buf;
+  class line_buf;
  class mem_fixed_allocator;
  class mem_elastic_allocator;
  class codestream;
@@ -96,6 +96,8 @@
      }
      const param_dfs* access_dfs()
      { if (dfs.exists()) return &dfs; else return NULL; }
+      const param_nlt* get_nlt()
+      { return &nlt; }
      mem_fixed_allocator* get_allocator() { return allocator; }
      mem_elastic_allocator* get_elastic_alloc() { return elastic_alloc; }
      outfile_base* get_file() { return outfile; }
@@ -161,6 +163,7 @@
      param_cap cap;           // extended capabilities
      param_qcd qcd;           // quantization default
      param_tlm tlm;           // tile-part lengths
+      param_nlt nlt;           // non-linearity point transformation
 
    private: // this is to handle qcc and coc
      int used_qcc_fields;
diff --git a/src/core/codestream/ojph_codestream_sse.cpp b/src/core/codestream/ojph_codestream_sse.cpp
index 7c64ad9..6a31cbd 100644
--- a/src/core/codestream/ojph_codestream_sse.cpp
+++ b/src/core/codestream/ojph_codestream_sse.cpp
@@ -49,6 +49,5 @@ namespace ojph {
      for (size_t i = 0; i < count; i += 16, p += 4)
        _mm_storeu_ps(p, zero);
    }
-
  }
}
\ No newline at end of file
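// How the new NLT plumbing above fits together, as a hedged usage sketch
// built only from calls this patch introduces (codestream::access_nlt() in
// ojph_codestream.cpp and param_nlt::set_type3_transformation() in
// ojph_params.cpp):
//
//   ojph::codestream cs;
//   ojph::param_nlt nlt = cs.access_nlt();
//   nlt.set_type3_transformation(0, true); // type-3 NLT on component 0
//   // later, param_nlt::check_validity() fills BDnlt from the SIZ bit
//   // depth/signedness and raises RSIZ_EXT_FLAG | RSIZ_NLT_FLAG, so that
//   // param_nlt::write() emits one NLT marker segment per enabled entry.
//
diff --git a/src/core/codestream/ojph_codestream_sse2.cpp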
b/src/core/codestream/ojph_codestream_sse2.cpp
index 9bb0643..3352bcd 100644
--- a/src/core/codestream/ojph_codestream_sse2.cpp
+++ b/src/core/codestream/ojph_codestream_sse2.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include <climits>
 #include <emmintrin.h>
 
 #include "ojph_defs.h"
@@ -42,7 +43,7 @@ namespace ojph {
  namespace local {

    //////////////////////////////////////////////////////////////////////////
-    ui32 sse2_find_max_val(ui32* address)
+    ui32 sse2_find_max_val32(ui32* address)
    {
      __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
      x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
@@ -59,14 +60,29 @@
    }

    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 sse2_find_max_val64(ui64* address)
+    {
+      __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
+      x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
+      x0 = _mm_or_si128(x0, x1);
+      _mm_storeu_si128((__m128i*)address, x0);
+      return *address;
+      // A single movd t, xmm0 can do the trick, but it is not available
+      // in SSE2 intrinsics. extract_epi32 is available in sse4.1
+      // ui32 t = (ui32)_mm_extract_epi16(x0, 0);
+      // t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
+      // return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
    {
      ojph_unused(delta_inv);

      // convert to sign and magnitude and keep max_val
      ui32 shift = 31 - K_max;
-      __m128i m0 = _mm_set1_epi32((int)0x80000000);
+      __m128i m0 = _mm_set1_epi32(INT_MIN);
      __m128i zero = _mm_setzero_si128();
      __m128i one = _mm_set1_epi32(1);
      __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
@@ -88,8 +104,8 @@
    }

    //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
    {
      ojph_unused(K_max);

@@ -118,34 +134,34 @@
    }

    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
    {
      ojph_unused(delta);
      ui32 shift = 31 - K_max;
-      __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
+      __m128i m1 = _mm_set1_epi32(INT_MAX);
      __m128i zero = _mm_setzero_si128();
      __m128i one = _mm_set1_epi32(1);
      si32 *p = (si32*)dp;
      for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
      {
-        __m128i v = _mm_load_si128((__m128i*)sp);
-        __m128i val = _mm_and_si128(v, m1);
-        val = _mm_srli_epi32(val, (int)shift);
-        __m128i sign = _mm_cmplt_epi32(v, zero);
-        val = _mm_xor_si128(val, sign); // negate 1's complement
-        __m128i ones = _mm_and_si128(sign, one);
-        val = _mm_add_epi32(val, ones); // 2's complement
-        _mm_storeu_si128((__m128i*)p, val);
+        __m128i v = _mm_load_si128((__m128i*)sp);
+        __m128i val = _mm_and_si128(v, m1);
+        val = _mm_srli_epi32(val, (int)shift);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        val = _mm_xor_si128(val, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi32(val, ones); // 2's complement
+        _mm_storeu_si128((__m128i*)p, val);
      }
    }
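    // An illustrative scalar model of the branchless sign-magnitude decode
    // used above and in the 64-bit variants below; it relies on two's
    // complement negation being ~x + 1, so xor-ing with an all-ones mask and
    // adding one negates exactly the negative samples (hypothetical helper,
    // shown only to document the SIMD logic):
    static si32 sign_mag_decode_sketch(ui32 v, ui32 shift)
    {
      ui32 mag  = (v & 0x7FFFFFFFu) >> shift;    // strip sign, undo shift
      ui32 mask = (v & 0x80000000u) ? ~0u : 0u;  // all ones when negative
      return (si32)((mag ^ mask) + (mask & 1));  // conditional negation
    }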
    //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
    {
      ojph_unused(K_max);
-      __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
+      __m128i m1 = _mm_set1_epi32(INT_MAX);
      __m128 d = _mm_set1_ps(delta);
      float *p = (float*)dp;
      for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
@@ -159,5 +175,59 @@
        _mm_storeu_ps(p, valf);
      }
    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val)
+    {
+      ojph_unused(delta_inv);
+
+      // convert to sign and magnitude and keep max_val
+      ui32 shift = 63 - K_max;
+      __m128i m0 = _mm_set1_epi64x(LLONG_MIN);
+      __m128i zero = _mm_setzero_si128();
+      __m128i one = _mm_set1_epi64x(1);
+      __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
+      __m128i *p = (__m128i*)sp;
+      for (ui32 i = 0; i < count; i += 2, p += 1, dp += 2)
+      {
+        __m128i v = _mm_loadu_si128(p);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
+        __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi64(val, ones); // 2's complement
+        sign = _mm_and_si128(sign, m0);
+        val = _mm_slli_epi64(val, (int)shift);
+        tmax = _mm_or_si128(tmax, val);
+        val = _mm_or_si128(val, sign);
+        _mm_storeu_si128((__m128i*)dp, val);
+      }
+      _mm_storeu_si128((__m128i*)max_val, tmax);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
+    {
+      ojph_unused(delta);
+      ui32 shift = 63 - K_max;
+      __m128i m1 = _mm_set1_epi64x(LLONG_MAX);
+      __m128i zero = _mm_setzero_si128();
+      __m128i one = _mm_set1_epi64x(1);
+      si64 *p = (si64*)dp;
+      for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
+      {
+        __m128i v = _mm_load_si128((__m128i*)sp);
+        __m128i val = _mm_and_si128(v, m1);
+        val = _mm_srli_epi64(val, (int)shift);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
+        val = _mm_xor_si128(val, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi64(val, ones); // 2's complement
+        _mm_storeu_si128((__m128i*)p, val);
+      }
+    }
  }
}
\ No newline at end of file
diff --git a/src/core/codestream/ojph_codestream_wasm.cpp b/src/core/codestream/ojph_codestream_wasm.cpp
index 19e47aa..e2cd444 100644
--- a/src/core/codestream/ojph_codestream_wasm.cpp
+++ b/src/core/codestream/ojph_codestream_wasm.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/
 
+#include <climits>
 #include <cstdlib>
 #include <wasm_simd128.h>
 
@@ -43,20 +44,17 @@ namespace ojph {
  namespace local {

-    //////////////////////////////////////////////////////////////////////////
-    #define REPEAT(a) a,a,a,a
-
    //////////////////////////////////////////////////////////////////////////
    void wasm_mem_clear(void* addr, size_t count)
    {
      float* p = (float*)addr;
-      v128_t zero = wasm_i32x4_const(REPEAT(0));
+      v128_t zero = wasm_i32x4_splat(0);
      for (size_t i = 0; i < count; i += 16, p += 4)
        wasm_v128_store(p, zero);
    }
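    // A note on the OR-based reductions used here and in the SSE2/AVX2
    // paths: the encoder only needs the position of the highest set
    // magnitude bit (to decide how many MSBs can be skipped), and bitwise OR
    // preserves exactly that, more cheaply than a true maximum. A scalar
    // model (illustrative helper, not an existing function):
    static ui32 or_reduce_sketch(const ui32* vals, ui32 count)
    {
      ui32 acc = 0;
      for (ui32 i = 0; i < count; ++i)
        acc |= vals[i]; // same most-significant-bit position as the maximum
      return acc;
    }

    //////////////////////////////////////////////////////////////////////////
-    ui32 wasm_find_max_val(ui32* address)
+    ui32 wasm_find_max_val32(ui32* address)
    {
      v128_t x1, x0 =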
wasm_v128_load(address); x1 = wasm_i32x4_shuffle(x0, x0, 2, 3, 2, 3); // x1 = x0[2,3,2,3] @@ -68,19 +66,29 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + ui64 wasm_find_max_val64(ui64* address) + { + v128_t x1, x0 = wasm_v128_load(address); + x1 = wasm_i64x2_shuffle(x0, x0, 1, 1); // x1 = x0[2,3,2,3] + x0 = wasm_v128_or(x0, x1); + ui64 t = (ui64)wasm_i64x2_extract_lane(x0, 0); + return t; + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(delta_inv); // convert to sign and magnitude and keep max_val ui32 shift = 31 - K_max; - v128_t m0 = wasm_i32x4_const(REPEAT((int)0x80000000)); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t m0 = wasm_i32x4_splat(INT_MIN); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); v128_t tmax = wasm_v128_load(max_val); - v128_t *p = (v128_t*)sp; - for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4) + si32 *p = (si32*)sp; + for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4) { v128_t v = wasm_v128_load(p); v128_t sign = wasm_i32x4_lt(v, zero); @@ -97,16 +105,16 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(K_max); //quantize and convert to sign and magnitude and keep max_val v128_t d = wasm_f32x4_splat(delta_inv); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); v128_t tmax = wasm_v128_load(max_val); float *p = (float*)sp; for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4) @@ -127,14 +135,14 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; - v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF)); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t m1 = wasm_i32x4_splat(INT_MAX); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); si32 *p = (si32*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) { @@ -150,11 +158,11 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); - v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF)); + v128_t m1 = wasm_i32x4_splat(INT_MAX); v128_t d = wasm_f32x4_splat(delta); float *p = (float*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) @@ -167,6 +175,58 @@ namespace ojph { valf = wasm_v128_or(valf, sign); wasm_v128_store(p, valf); } - } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_to_cb64(const void *sp, 
ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 63 - K_max; + v128_t m0 = wasm_i64x2_splat(LLONG_MIN); + v128_t zero = wasm_i64x2_splat(0); + v128_t one = wasm_i64x2_splat(1); + v128_t tmax = wasm_v128_load(max_val); + si64 *p = (si64*)sp; + for (ui32 i = 0; i < count; i += 2, p += 2, dp += 2) + { + v128_t v = wasm_v128_load(p); + v128_t sign = wasm_i64x2_lt(v, zero); + v128_t val = wasm_v128_xor(v, sign); // negate 1's complement + v128_t ones = wasm_v128_and(sign, one); + val = wasm_i64x2_add(val, ones); // 2's complement + sign = wasm_v128_and(sign, m0); + val = wasm_i64x2_shl(val, shift); + tmax = wasm_v128_or(tmax, val); + val = wasm_v128_or(val, sign); + wasm_v128_store(dp, val); + } + wasm_v128_store(max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 63 - K_max; + v128_t m1 = wasm_i64x2_splat(LLONG_MAX); + v128_t zero = wasm_i64x2_splat(0); + v128_t one = wasm_i64x2_splat(1); + si64 *p = (si64*)dp; + for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2) + { + v128_t v = wasm_v128_load((v128_t*)sp); + v128_t val = wasm_v128_and(v, m1); + val = wasm_i64x2_shr(val, shift); + v128_t sign = wasm_i64x2_lt(v, zero); + val = wasm_v128_xor(val, sign); // negate 1's complement + v128_t ones = wasm_v128_and(sign, one); + val = wasm_i64x2_add(val, ones); // 2's complement + wasm_v128_store(p, val); + } + } } } \ No newline at end of file diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index b6ada17..8a234e5 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -372,6 +372,27 @@ namespace ojph { // //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + void param_nlt::set_type3_transformation(ui32 comp_num, bool enable) + { + state->set_type3_transformation(comp_num, enable); + } + + ////////////////////////////////////////////////////////////////////////// + bool param_nlt::get_type3_transformation(ui32 comp_num, ui8& bit_depth, + bool& is_signed) + { + return state->get_type3_transformation(comp_num, bit_depth, is_signed); + } + + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void comment_exchange::set_string(const char* str) { @@ -611,7 +632,7 @@ namespace ojph { if ((Rsiz & 0x4000) == 0) OJPH_ERROR(0x00050044, "Rsiz bit 14 is not set (this is not a JPH file)"); - if ((Rsiz & 0x8000) != 0 && (Rsiz & 0xF5F) != 0) + if ((Rsiz & 0x8000) != 0 && (Rsiz & 0xD5F) != 0) OJPH_WARN(0x00050001, "Rsiz in SIZ has unimplemented fields"); if (file->read(&Xsiz, 4) != 4) OJPH_ERROR(0x00050045, "error reading SIZ marker"); @@ -755,6 +776,25 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + ui32 + param_cod::propose_implementation_precision(const param_siz* siz) const + { + bool employing_color_transform = is_employing_color_transform() ? 
1 : 0; + bool reversible = atk->is_reversible(); + + ui32 bit_depth = 32; + if (reversible) { + bit_depth = siz->get_bit_depth(comp_num); + bit_depth += comp_num < 3 ? employing_color_transform : 0; + // 3 or 4 is how many extra bits are needed for the HH band at the + // bottom most level of decomposition. + bit_depth += get_num_decompositions() > 5 ? 4 : 3; + } + + return bit_depth; + } + ////////////////////////////////////////////////////////////////////////// bool param_cod::write(outfile_base *file) { @@ -908,24 +948,46 @@ namespace ojph { void param_qcd::set_rev_quant(ui32 num_decomps, ui32 bit_depth, bool is_employing_color_transform) { - int guard_bits = 1; - Sqcd = (ui8)(guard_bits << 5); //one guard bit, and no quantization ui32 B = bit_depth; B += is_employing_color_transform ? 1 : 0; //1 bit for RCT int s = 0; - float bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); - //we leave some leeway for numerical error by multiplying by 1.1f - ui32 X = (ui32) ceil(log(bibo_l * bibo_l * 1.1f) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + double bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); + ui32 X = (ui32) ceil(log(bibo_l * bibo_l) / M_LN2); + u8_SPqcd[s++] = (ui8)(B + X); + ui32 max_B_plus_X = (ui32)(B + X); for (ui32 d = num_decomps; d > 0; --d) { - float bibo_l = bibo_gains::get_bibo_gain_l(d, true); - float bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); - X = (ui32) ceil(log(bibo_h * bibo_l * 1.1f) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); - u8_SPqcd[s++] = (ui8)((B + X) << 3); - X = (ui32) ceil(log(bibo_h * bibo_h * 1.1f) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + double bibo_l = bibo_gains::get_bibo_gain_l(d, true); + double bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); + X = (ui32) ceil(log(bibo_h * bibo_l) / M_LN2); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + X = (ui32) ceil(log(bibo_h * bibo_h) / M_LN2); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + } + + if (max_B_plus_X > 38) + OJPH_ERROR(0x00050151, "The specified combination of bit_depth, " + "colour transform, and type of wavelet transform requires more than " + "38 bits; it requires %d bits. 
This is beyond what is allowed in " + "the JPEG2000 image coding format.", max_B_plus_X); + + int guard_bits = ojph_max(1, (si32)max_B_plus_X - 31); + Sqcd = (ui8)(guard_bits << 5); + s = 0; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + for (ui32 d = num_decomps; d > 0; --d) + { + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; } } @@ -981,8 +1043,11 @@ namespace ojph { ui32 B = 0; int irrev = Sqcd & 0x1F; if (irrev == 0) //reversible - for (ui32 i = 0; i < num_subbands; ++i) - B = ojph_max(B, (u8_SPqcd[i] >> 3) + get_num_guard_bits() - 1u); + for (ui32 i = 0; i < num_subbands; ++i) { + ui32 t = decode_SPqcd(u8_SPqcd[i]); + t += get_num_guard_bits() - 1u; + B = ojph_max(B, t); + } else if (irrev == 2) //scalar expounded for (ui32 i = 0; i < num_subbands; ++i) { @@ -1052,9 +1117,9 @@ namespace ojph { } int irrev = Sqcd & 0x1F; - if (irrev == 0) //reversible; this is (10.22) from the J2K book + if (irrev == 0) // reversible; this is (10.22) from the J2K book { - num_bits += u8_SPqcd[idx] >> 3; + num_bits += decode_SPqcd(u8_SPqcd[idx]); num_bits = num_bits == 0 ? 0 : num_bits - 1; } else if (irrev == 1) @@ -1214,6 +1279,239 @@ namespace ojph { OJPH_ERROR(0x000500AA, "wrong Sqcc value in QCC marker"); } + ////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + ////////////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////////////////////// + void param_nlt::check_validity(param_siz& siz) + { + if (is_any_enabled() == false) + return; + + bool all_same = true; + ui32 num_comps = siz.get_num_components(); + + // first stage; find out if all components captured by the default + // entry (ALL_COMPS) has the same bit_depth/signedness, + // while doing this, set the BDnlt for components not captured but the + // default entry (ALL_COMPS) + ui32 bit_depth = 0; // unknown yet + bool is_signed = false; // unknown yet + for (ui32 c = 0; c < num_comps; ++c) + { + param_nlt* p = get_comp_object(c); + if (p == NULL || !p->enabled) // comp is not in list or not enabled + { + if (bit_depth == 0) + { // this is the first component which has not type 3 nlt definition + bit_depth = siz.get_bit_depth(c); + is_signed = siz.is_signed(c); + } + else + { // we have seen an undefined component previously + all_same = all_same && (bit_depth == siz.get_bit_depth(c)); + all_same = all_same && (is_signed == siz.is_signed(c)); + } + } + else + { + p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1); + p->BDnlt = (ui8)(p->BDnlt | (siz.is_signed(c) ? 0x80 : 0)); + } + } + + // If the default entry is enabled/used, then if the components captured + // by it are not the same, we need to create entries for these + // components + if (this->enabled) + { + if (bit_depth != 0) // default captures some components + { + // captures at least one of the componets in the default entry + this->BDnlt = (ui8)((bit_depth - 1) | (is_signed ? 0x80 : (ui8)0)); + + if (!all_same) + { + // We cannot use the default for all components in it, so we + // will keep the first one, and we will also define other + // components on their own. 
+
+            for (ui32 c = 0; c < num_comps; ++c)
+            {
+              ui32 bd = siz.get_bit_depth(c);
+              bool is = siz.is_signed(c);
+              if (bd != bit_depth || is != is_signed)
+              {
+                // this component has different bit_depth/signedness than the
+                // default (ALL_COMPS) entry
+                param_nlt* p = get_comp_object(c);
+                if (p == NULL || !p->enabled)
+                {
+                  // this component is captured by the default (ALL_COMPS)
+                  // entry (because it is either not in the list, or
+                  // not enabled)
+                  if (p == NULL)
+                    p = add_object(c);
+                  p->enabled = true;
+                  p->BDnlt = (ui8)((bd - 1) | (is ? 0x80 : 0));
+                }
+              }
+            }
+          }
+        }
+        else
+          this->enabled = false;
+      }
+
+      trim_non_existing_components(num_comps);
+
+      if (is_any_enabled() == false)
+        return;
+      siz.set_Rsiz_flag(param_siz::RSIZ_EXT_FLAG | param_siz::RSIZ_NLT_FLAG);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_nlt::set_type3_transformation(ui32 comp_num, bool enable)
+    {
+      param_nlt* p = get_comp_object(comp_num);
+      if (p == NULL)
+        p = add_object(comp_num);
+      p->enabled = enable;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_nlt::get_type3_transformation(ui32 comp_num, ui8& bit_depth,
+                                             bool& is_signed) const
+    {
+      const param_nlt* p = get_comp_object(comp_num);
+      p = p ? p : this;
+      if (p->enabled)
+      {
+        bit_depth = (ui8)((p->BDnlt & 0x7F) + 1);
+        bit_depth = bit_depth <= 38 ? bit_depth : 38;
+        is_signed = (p->BDnlt & 0x80) == 0x80;
+      }
+      return p->enabled;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_nlt::write(outfile_base* file) const
+    {
+      if (is_any_enabled() == false)
+        return true;
+
+      char buf[2];
+      bool result = true;
+      const param_nlt* p = this;
+      while (p)
+      {
+        if (p->enabled)
+        {
+          *(ui16*)buf = JP2K_MARKER::NLT;
+          *(ui16*)buf = swap_byte(*(ui16*)buf);
+          result &= file->write(&buf, 2) == 2;
+          *(ui16*)buf = swap_byte(p->Lnlt);
+          result &= file->write(&buf, 2) == 2;
+          *(ui16*)buf = swap_byte(p->Cnlt);
+          result &= file->write(&buf, 2) == 2;
+          result &= file->write(&p->BDnlt, 1) == 1;
+          result &= file->write(&p->Tnlt, 1) == 1;
+        }
+        p = p->next;
+      }
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_nlt::read(infile_base* file)
+    {
+      ui8 buf[6];
+
+      if (file->read(buf, 6) != 6)
+        OJPH_ERROR(0x00050141, "error reading NLT marker segment");
+
+      ui16 length = swap_byte(*(ui16*)buf);
+      if (length != 6 || buf[5] != 3) // wrong length or type
+        OJPH_ERROR(0x00050142, "Unsupported NLT type %d\n", buf[5]);
+
+      ui16 comp = swap_byte(*(ui16*)(buf + 2));
+      param_nlt* p = this;
+      if (comp != special_comp_num::ALL_COMPS)
+      {
+        p = get_comp_object(comp);
+        if (p == NULL)
+          p = add_object(comp);
+      }
+      p->enabled = true;
+      p->Cnlt = comp;
+      p->BDnlt = buf[4];
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    param_nlt* param_nlt::get_comp_object(ui32 comp_num)
+    {
+      // cast object to constant
+      const param_nlt* const_p = const_cast<const param_nlt*>(this);
+      // call using the constant object, then cast the result to non-const
+      return const_cast<param_nlt*>(const_p->get_comp_object(comp_num));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    const param_nlt* param_nlt::get_comp_object(ui32 comp_num) const
+    {
+      if (Cnlt == comp_num)
+        return this;
+      else {
+        param_nlt* p = next;
+        while (p && p->Cnlt != comp_num)
+          p = p->next;
+        return p;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    param_nlt*
param_nlt::add_object(ui32 comp_num) + { + assert(Cnlt != comp_num); + param_nlt* p = this; + while (p->next != NULL) { + assert(p->Cnlt != comp_num); + p = p->next; + } + p->next = new param_nlt; + p->alloced_next = true; + p = p->next; + p->Cnlt = (ui16)comp_num; + return p; + } + + ////////////////////////////////////////////////////////////////////////// + bool param_nlt::is_any_enabled() const + { + // check if any field is enabled + const param_nlt* p = this; + while (p && p->enabled == false) + p = p->next; + return (p != NULL); + } + + ////////////////////////////////////////////////////////////////////////// + void param_nlt::trim_non_existing_components(ui32 num_comps) + { + param_nlt* p = this->next; + while (p) { + if (p->enabled == true && p->Cnlt >= num_comps) + p->enabled = false; + p = p->next; + } + } + + ////////////////////////////////////////////////////////////////////////// // // @@ -1239,10 +1537,8 @@ namespace ojph { result &= file->write(&buf, 2) == 2; *(ui32*)buf = swap_byte(Psot); result &= file->write(&buf, 4) == 4; - *(ui8*)buf = TPsot; - result &= file->write(&buf, 1) == 1; - *(ui8*)buf = TNsot; - result &= file->write(&buf, 1) == 1; + result &= file->write(&TPsot, 1) == 1; + result &= file->write(&TNsot, 1) == 1; return result; } @@ -1263,10 +1559,8 @@ namespace ojph { result &= file->write(&buf, 2) == 2; *(ui32*)buf = swap_byte(payload_len + 14); result &= file->write(&buf, 4) == 4; - *(ui8*)buf = TPsot; - result &= file->write(&buf, 1) == 1; - *(ui8*)buf = TNsot; - result &= file->write(&buf, 1) == 1; + result &= file->write(&TPsot, 1) == 1; + result &= file->write(&TNsot, 1) == 1; return result; } @@ -1363,7 +1657,7 @@ namespace ojph { "In any case, this limit means that we have 10922 " "tileparts or more, which is a huge number."); this->num_pairs = num_pairs; - pairs = (Ttlm_Ptlm_pair*)store; + pairs = store; Ltlm = (ui16)(4 + 6 * num_pairs); Ztlm = 0; Stlm = 0x60; diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 1958b8e..cce5cd8 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -138,6 +138,7 @@ namespace ojph { COM = 0xFF64, //comment DFS = 0xFF72, //downsampling factor styles ADS = 0xFF73, //arbitrary decomposition styles + NLT = 0xFF76, //non-linearity point transformation ATK = 0xFF79, //arbitrary transformation kernels SOT = 0xFF90, //start of tile-part SOP = 0xFF91, //start of packet @@ -165,13 +166,26 @@ namespace ojph { { friend ::ojph::param_siz; + public: + enum : ui16 { + RSIZ_NLT_FLAG = 0x200, + RSIZ_HT_FLAG = 0x4000, + RSIZ_EXT_FLAG = 0x8000, + }; + public: param_siz() { - memset(this, 0, sizeof(param_siz)); + Lsiz = Csiz = 0; + Xsiz = Ysiz = XOsiz = YOsiz = XTsiz = YTsiz = XTOsiz = YTOsiz = 0; + skipped_resolutions = 0; + memset(store, 0, sizeof(store)); + ws_kern_support_needed = dfs_support_needed = false; + cod = NULL; + dfs = NULL; + Rsiz = RSIZ_HT_FLAG; cptr = store; old_Csiz = 4; - Rsiz = 0x4000; //for jph, bit 14 of Rsiz is 1 } ~param_siz() @@ -255,6 +269,7 @@ namespace ojph { ui32 t = ojph_div_ceil(Xsiz, ds) - ojph_div_ceil(XOsiz, ds); return t; } + ui32 get_height(ui32 comp_num) const { assert(comp_num < get_num_components()); @@ -273,6 +288,11 @@ namespace ojph { bool is_ws_kern_support_needed() { return ws_kern_support_needed; } bool is_dfs_support_needed() { return dfs_support_needed; } + void set_Rsiz_flag(ui16 flag) + { Rsiz |= flag; } + void reset_Rsiz_flag(ui16 flag) + { Rsiz = (ui16)(Rsiz & ~flag); } + private: ui16 Lsiz; 
ui16 Rsiz; @@ -503,6 +523,9 @@ namespace ojph { return (Scod & 4) == 4; } + //////////////////////////////////////// + ui32 propose_implementation_precision(const param_siz* siz) const; + //////////////////////////////////////// bool write(outfile_base *file); @@ -626,7 +649,11 @@ namespace ojph { bool is_employing_color_transform); void set_irrev_quant(ui32 num_decomps); - protected: + ui8 decode_SPqcd(ui8 v) const + { return (ui8)(v >> 3); } + ui8 encode_SPqcd(ui8 v) const + { return (ui8)(v << 3); } + protected: ui16 Lqcd; ui8 Sqcd; union @@ -659,6 +686,64 @@ namespace ojph { ui16 comp_idx; }; + /////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + /////////////////////////////////////////////////////////////////////////// + // data structures used by param_nlt + struct param_nlt + { + using special_comp_num = ojph::param_nlt::special_comp_num; + public: + param_nlt() { + Lnlt = 6; + Cnlt = special_comp_num::ALL_COMPS; // default + BDnlt = 0; + Tnlt = 3; + enabled = false; next = NULL; alloced_next = false; + } + + ~param_nlt() { + if (next && alloced_next) { + delete next; + alloced_next = false; + next = NULL; + } + } + + void check_validity(param_siz& siz); + void set_type3_transformation(ui32 comp_num, bool enable); + bool get_type3_transformation(ui32 comp_num, ui8& bit_depth, + bool& is_signed) const; + bool write(outfile_base* file) const; + void read(infile_base* file); + + private: + const param_nlt* get_comp_object(ui32 comp_num) const; + param_nlt* get_comp_object(ui32 comp_num); + param_nlt* add_object(ui32 comp_num); + bool is_any_enabled() const; + void trim_non_existing_components(ui32 num_comps); + + private: + ui16 Lnlt; // length of the marker segment excluding marker + ui16 Cnlt; // Component involved in the transformation + ui8 BDnlt; // Decoded image component bit depth parameter + ui8 Tnlt; // Type of non-linearity + bool enabled; // true if this object is used + param_nlt* next; // for chaining NLT markers + bool alloced_next; // true if next was allocated, not just set to an + // existing object + + // The top level param_nlt object is not allocated, but as part of + // codestream, and is used to manage allocated next objects. + // next holds a list of param_nlt objects, which are managed by the top + // param_nlt object. 
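+    // In other words, the object embedded in the codestream always
+    // describes Cnlt == ALL_COMPS; per-component objects are appended to
+    // the chain by add_object() and released by the chained destructor
+    // when alloced_next is true.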
+  };
+
  ///////////////////////////////////////////////////////////////////////////
  //
  //
@@ -792,9 +877,10 @@ namespace ojph {
    };
  public: // member functions
-    param_dfs() { memset(this, 0, sizeof(param_dfs)); }
+    param_dfs() { init(); }
    ~param_dfs() { if (next) delete next; }
-    void init() { memset(this, 0, sizeof(param_dfs)); }
+    void init()
+    { Ldfs = Sdfs = Ids = 0; memset(Ddfs, 0, sizeof(Ddfs)); next = NULL; }
    bool read(infile_base *file);
    bool exists() const { return Ldfs != 0; }
@@ -869,8 +955,17 @@
    bool read_coefficient(infile_base *file, float &K);
    bool read_coefficient(infile_base *file, si16 &K);
    void init(bool clear_all = true) {
-      if (clear_all)
-        memset(this, 0, sizeof(param_atk));
+      if (clear_all)
+      {
+        Latk = Satk = 0;
+        Katk = 0.0f;
+        Natk = 0;
+        d = NULL;
+        max_steps = 0;
+        memset(d_store, 0, sizeof(d_store));
+        next = NULL;
+        alloced_next = false;
+      }
      d = d_store;
      max_steps = sizeof(d_store) / sizeof(lifting_step); }
    void init_irv97();
diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp
index 813e33b..803790d 100644
--- a/src/core/codestream/ojph_precinct.cpp
+++ b/src/core/codestream/ojph_precinct.cpp
@@ -221,7 +221,9 @@ namespace ojph {
          {
            int num_zeros = *mmsb_tag.get(x>>levm1, y>>levm1, levm1);
            num_zeros -= *mmsb_tag.get(x>>cur_lev, y>>cur_lev, cur_lev);
-            bb_put_bits(&bb, 1, num_zeros + 1,
+            bb_put_zeros(&bb, num_zeros,
+                         elastic, cur_coded_list, ph_bytes);
+            bb_put_bits(&bb, 1, 1,
                         elastic, cur_coded_list, ph_bytes);
            *mmsb_tag_flags.get(x>>levm1, y>>levm1, levm1) = 1;
          }
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index b82a810..0246400 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -199,6 +199,9 @@ namespace ojph {
        allocator->pre_alloc_obj((size_t)num_precincts.area());
      }

+      const param_siz* szp = codestream->get_siz();
+      ui32 precision = cdp->propose_implementation_precision(szp);
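+      // when propose_implementation_precision() reports more than 32 bits,
+      // the si64 (64-bit sample) allocation path below is taken instead of
+      // the si32 path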
+
      //allocate lines
      if (skipped_res_for_recon == false)
      {
@@ -207,10 +210,19 @@
        allocator->pre_alloc_obj(num_steps + 2);

        ui32 width = res_rect.siz.w + 1;
-        for (ui32 i = 0; i < num_steps; ++i)
+        if (precision <= 32) {
+          for (ui32 i = 0; i < num_steps; ++i)
+            allocator->pre_alloc_data<si32>(width, 1);
+          allocator->pre_alloc_data<si32>(width, 1);
          allocator->pre_alloc_data<si32>(width, 1);
-        allocator->pre_alloc_data<si32>(width, 1);
-        allocator->pre_alloc_data<si32>(width, 1);
+        }
+        else
+        {
+          for (ui32 i = 0; i < num_steps; ++i)
+            allocator->pre_alloc_data<si64>(width, 1);
+          allocator->pre_alloc_data<si64>(width, 1);
+          allocator->pre_alloc_data<si64>(width, 1);
+        }
      }
    }

@@ -245,8 +257,8 @@
      const param_dfs* dfs = codestream->access_dfs();
      if (dfs == NULL) {
        OJPH_ERROR(0x00070011, "There is a problem with codestream "
-          "marker segments. COD/COC specifies the use of a DFS marker "
-          "but there are no DFS markers within the main codestream "
+          "marker segments. COD/COC specifies the use of a DFS marker "
+          "but there are no DFS markers within the main codestream "
          "headers");
      }
      else {
@@ -436,6 +448,9 @@ namespace ojph {
        level_index[i] = level_index[i - 1] + val;
      cur_precinct_loc = point(0, 0);

+      const param_siz* szp = codestream->get_siz();
+      ui32 precision = cdp->propose_implementation_precision(szp);
+
      //allocate lines
      if (skipped_res_for_recon == false)
      {
@@ -460,11 +475,22 @@ namespace ojph {
        // initiate storage of line_buf
        ui32 width = res_rect.siz.w + 1;
-        for (ui32 i = 0; i < num_steps; ++i)
-          ssp[i].line->wrap(
-            allocator->post_alloc_data<si32>(width, 1), width, 1);
-        sig->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
-        aug->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+        if (precision <= 32)
+        {
+          for (ui32 i = 0; i < num_steps; ++i)
+            ssp[i].line->wrap(
+              allocator->post_alloc_data<si32>(width, 1), width, 1);
+          sig->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+          aug->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+        }
+        else
+        {
+          for (ui32 i = 0; i < num_steps; ++i)
+            ssp[i].line->wrap(
+              allocator->post_alloc_data<si64>(width, 1), width, 1);
+          sig->line->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
+          aug->line->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
+        }

      cur_line = 0;
      rows_to_produce = res_rect.siz.h;
@@ -682,8 +708,9 @@ namespace ojph {
            rev_horz_syn(atk, aug->line, child_res->pull_line(),
                         bands[1].pull_line(), width, horz_even);
          else
-            memcpy(aug->line->i32, child_res->pull_line()->i32,
-                   width * sizeof(si32));
+            memcpy(aug->line->p, child_res->pull_line()->p,
+                   (size_t)width
+                   * (aug->line->flags & line_buf::LFT_SIZE_MASK));
          aug->active = true;
          vert_even = !vert_even;
          ++cur_line;
@@ -694,8 +721,9 @@
            rev_horz_syn(atk, sig->line, bands[2].pull_line(),
                         bands[3].pull_line(), width, horz_even);
          else
-            memcpy(sig->line->i32, bands[2].pull_line()->i32,
-                   width * sizeof(si32));
+            memcpy(sig->line->p, bands[2].pull_line()->p,
+                   (size_t)width
+                   * (sig->line->flags & line_buf::LFT_SIZE_MASK));
          sig->active = true;
          vert_even = !vert_even;
          ++cur_line;
@@ -733,8 +761,9 @@
            rev_horz_syn(atk, aug->line, child_res->pull_line(),
                         bands[1].pull_line(), width, horz_even);
          else
-            memcpy(aug->line->i32, child_res->pull_line()->i32,
-                   width * sizeof(si32));
+            memcpy(aug->line->p, child_res->pull_line()->p,
+                   (size_t)width
+                   * (aug->line->flags & line_buf::LFT_SIZE_MASK));
        }
        else
        {
@@ -742,11 +771,22 @@
            rev_horz_syn(atk, aug->line, bands[2].pull_line(),
                         bands[3].pull_line(), width, horz_even);
          else
-            memcpy(aug->line->i32, bands[2].pull_line()->i32,
-                   width * sizeof(si32));
-          si32* sp = aug->line->i32;
-          for (ui32 i = width; i > 0; --i)
-            *sp++ >>= 1;
+            memcpy(aug->line->p, bands[2].pull_line()->p,
+                   (size_t)width
+                   * (aug->line->flags & line_buf::LFT_SIZE_MASK));
+          if (aug->line->flags & line_buf::LFT_32BIT)
+          {
+            si32* sp = aug->line->i32;
+            for (ui32 i = width; i > 0; --i)
+              *sp++ >>= 1;
+          }
+          else
+          {
+            assert(aug->line->flags & line_buf::LFT_64BIT);
+            si64* sp = aug->line->i64;
+            for (ui32 i = width; i > 0; --i)
+              *sp++ >>= 1;
+          }
        }
        return aug->line;
      }
@@ -854,8 +894,8 @@
          rev_horz_syn(atk, aug->line, child_res->pull_line(),
                       bands[1].pull_line(), width, horz_even);
        else
-          memcpy(aug->line->i32, child_res->pull_line()->i32,
-                 width * sizeof(si32));
+          memcpy(aug->line->p, child_res->pull_line()->p,
+                 (size_t)width * (aug->line->flags & line_buf::LFT_SIZE_MASK));
        return aug->line;
      }
      else
diff --git
a/src/core/codestream/ojph_resolution.h b/src/core/codestream/ojph_resolution.h
index 635a4ce..6156455 100644
--- a/src/core/codestream/ojph_resolution.h
+++ b/src/core/codestream/ojph_resolution.h
@@ -45,7 +45,7 @@ namespace ojph {
  ////////////////////////////////////////////////////////////////////////////
  //defined elsewhere
-  struct line_buf;
+  class line_buf;
  class mem_elastic_allocator;
  class codestream;
diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp
index cf007fc..8efc8de 100644
--- a/src/core/codestream/ojph_subband.cpp
+++ b/src/core/codestream/ojph_subband.cpp
@@ -91,13 +91,18 @@ namespace ojph {
      allocator->pre_alloc_obj<codeblock>((size_t)num_blocks.area());

      for (ui32 i = 0; i < num_blocks.w; ++i)
-        codeblock::pre_alloc(codestream, nominal);
+        codeblock::pre_alloc(codestream, comp_num, nominal);

      //allocate lines
      allocator->pre_alloc_obj<line_buf>(1);
      //allocate line_buf
      ui32 width = band_rect.siz.w + 1;
-      allocator->pre_alloc_data<si32>(width, 1);
+      const param_siz* szp = codestream->get_siz();
+      ui32 precision = cdp->propose_implementation_precision(szp);
+      if (precision <= 32)
+        allocator->pre_alloc_data<si32>(width, 1);
+      else
+        allocator->pre_alloc_data<si64>(width, 1);
    }

    //////////////////////////////////////////////////////////////////////////
@@ -192,7 +197,12 @@ namespace ojph {
      lines = allocator->post_alloc_obj<line_buf>(1);
      //allocate line_buf
      ui32 width = band_rect.siz.w + 1;
-      lines->wrap(allocator->post_alloc_data<si32>(width,1),width,1);
+      const param_siz* szp = codestream->get_siz();
+      ui32 precision = cdp->propose_implementation_precision(szp);
+      if (precision <= 32)
+        lines->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+      else
+        lines->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
    }

    //////////////////////////////////////////////////////////////////////////
@@ -256,10 +266,11 @@
      if (empty)
        return;

-      assert(l->pre_size == lines[0].pre_size && l->size == lines[0].size);
-      si32* t = lines[0].i32;
-      lines[0].i32 = l->i32;
-      l->i32 = t;
+      assert(l->pre_size == lines[0].pre_size && l->size == lines[0].size &&
+             l->flags == lines[0].flags);
+      void* p = lines[0].p;
+      lines[0].p = l->p;
+      l->p = p;
    }

    //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h
index 8cadae0..e1c291a 100644
--- a/src/core/codestream/ojph_subband.h
+++ b/src/core/codestream/ojph_subband.h
@@ -45,7 +45,7 @@ namespace ojph {
  ////////////////////////////////////////////////////////////////////////////
  //defined elsewhere
-  struct line_buf;
+  class line_buf;
  class mem_elastic_allocator;
  class codestream;
@@ -94,6 +94,8 @@
      bool exists() { return !empty; }

      line_buf* pull_line();
+      resolution* get_parent() { return parent; }
+      const resolution* get_parent() const { return parent; }

    private:
      bool empty; // true if the subband has no pixels or
diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index 3be907d..4755bb4 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -67,6 +67,7 @@ namespace ojph {
      allocator->pre_alloc_obj<ui32>(num_comps); //for line_offsets
      allocator->pre_alloc_obj<ui32>(num_comps); //for num_bits
      allocator->pre_alloc_obj<bool>(num_comps); //for is_signed
+      allocator->pre_alloc_obj<bool>(num_comps); //for nlt_type3
      allocator->pre_alloc_obj<ui32>(num_comps); //for cur_line

      ui32 tilepart_div = codestream->get_tilepart_div();
@@ -142,6 +143,7 @@

      //allocate tiles_comp
      const param_siz *szp =
codestream->get_siz();
+      const param_nlt *nlp = codestream->get_nlt();

      this->num_bytes = 0;
      num_comps = szp->get_num_components();
@@ -152,6 +154,7 @@
      line_offsets = allocator->post_alloc_obj<ui32>(num_comps);
      num_bits = allocator->post_alloc_obj<ui32>(num_comps);
      is_signed = allocator->post_alloc_obj<bool>(num_comps);
+      nlt_type3 = allocator->post_alloc_obj<bool>(num_comps);
      cur_line = allocator->post_alloc_obj<ui32>(num_comps);

      profile = codestream->get_profile();
@@ -176,6 +179,8 @@
      ui32 width = 0;
      for (ui32 i = 0; i < num_comps; ++i)
      {
+        ui8 bd; bool is; // used for nlt_type3
+
        point downsamp = szp->get_downsampling(i);
        point recon_downsamp = szp->get_recon_downsampling(i);
@@ -205,6 +210,13 @@
        num_bits[i] = szp->get_bit_depth(i);
        is_signed[i] = szp->is_signed(i);
+        nlt_type3[i] = nlp->get_type3_transformation(i, bd, is);
+        if (nlt_type3[i] == true && (bd != num_bits[i] || is != is_signed[i]))
+          OJPH_ERROR(0x000300A1, "Mismatch between Ssiz (bit_depth = %d, "
+            "is_signed = %s) from SIZ marker segment, and BDnlt "
+            "(bit_depth = %d, is_signed = %s) from NLT marker segment, "
+            "for component %d", num_bits[i],
+            is_signed[i] ? "True" : "False", bd, is ? "True" : "False", i);
        cur_line[i] = 0;
      }
@@ -219,8 +231,7 @@
        num_lines = 3;
        lines = allocator->post_alloc_obj<line_buf>(num_lines);
        for (int i = 0; i < 3; ++i)
-          lines[i].wrap(
-            allocator->post_alloc_data<si32>(width,0),width,0);
+          lines[i].wrap(allocator->post_alloc_data<si32>(width, 0), width, 0);
      }
      else
      {
@@ -247,13 +258,15 @@
        line_buf *tc = comps[comp_num].get_line();
        if (reversible)
        {
-          int shift = 1 << (num_bits[comp_num] - 1);
-          const si32 *sp = line->i32 + line_offsets[comp_num];
-          si32* dp = tc->i32;
-          if (is_signed[comp_num])
-            memcpy(dp, sp, comp_width * sizeof(si32));
-          else
-            cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width);
+          si64 shift = (si64)1 << (num_bits[comp_num] - 1);
+          if (is_signed[comp_num] && nlt_type3[comp_num])
+            rev_convert_nlt_type3(line, line_offsets[comp_num],
+              tc, 0, shift + 1, comp_width);
+          else {
+            shift = is_signed[comp_num] ? 0 : -shift;
+            rev_convert(line, line_offsets[comp_num], tc, 0,
+              shift, comp_width);
+          }
        }
        else
        {
@@ -269,22 +282,25 @@
      }
      else
      {
+        si64 shift = (si64)1 << (num_bits[comp_num] - 1);
        ui32 comp_width = comp_rects[comp_num].siz.w;
        if (reversible)
        {
-          int shift = 1 << (num_bits[comp_num] - 1);
-          const si32 *sp = line->i32 + line_offsets[comp_num];
-          si32 *dp = lines[comp_num].i32;
-          if (is_signed[comp_num])
-            memcpy(dp, sp, comp_width * sizeof(si32));
-          else
-            cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width);
+          if (is_signed[comp_num] && nlt_type3[comp_num])
+            rev_convert_nlt_type3(line, line_offsets[comp_num],
+              lines + comp_num, 0, shift + 1, comp_width);
+          else {
+            shift = is_signed[comp_num] ?
0 : -shift; + rev_convert(line, line_offsets[comp_num], lines + comp_num, 0, + shift, comp_width); + } + if (comp_num == 2) { // reversible color transform - rct_forward(lines[0].i32, lines[1].i32, lines[2].i32, - comps[0].get_line()->i32, - comps[1].get_line()->i32, - comps[2].get_line()->i32, comp_width); + rct_forward(lines + 0, lines + 1, lines + 2, + comps[0].get_line(), + comps[1].get_line(), + comps[2].get_line(), comp_width); comps[0].push_line(); comps[1].push_line(); comps[2].push_line(); @@ -330,13 +346,15 @@ namespace ojph { ui32 comp_width = recon_comp_rects[comp_num].siz.w; if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp = src_line->i32; - si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) - memcpy(dp, sp, comp_width * sizeof(si32)); - else - cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(src_line, 0, tgt_line, + line_offsets[comp_num], shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 0 : shift; + rev_convert(src_line, 0, tgt_line, + line_offsets[comp_num], shift, comp_width); + } } else { @@ -356,9 +374,9 @@ namespace ojph { if (comp_num == 0) { if (reversible) - rct_backward(comps[0].pull_line()->i32, comps[1].pull_line()->i32, - comps[2].pull_line()->i32, lines[0].i32, lines[1].i32, - lines[2].i32, comp_width); + rct_backward(comps[0].pull_line(), comps[1].pull_line(), + comps[2].pull_line(), lines + 0, lines + 1, + lines + 2, comp_width); else ict_backward(comps[0].pull_line()->f32, comps[1].pull_line()->f32, comps[2].pull_line()->f32, lines[0].f32, lines[1].f32, @@ -366,17 +384,20 @@ namespace ojph { } if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp; + si64 shift = (si64)1 << (num_bits[comp_num] - 1); + line_buf* src_line; if (comp_num < 3) - sp = lines[comp_num].i32; - else - sp = comps[comp_num].pull_line()->i32; - si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) - memcpy(dp, sp, comp_width * sizeof(si32)); + src_line = lines + comp_num; else - cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); + src_line = comps[comp_num].pull_line(); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(src_line, 0, tgt_line, + line_offsets[comp_num], shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 
0 : shift; + rev_convert(src_line, 0, tgt_line, + line_offsets[comp_num], shift, comp_width); + } } else { diff --git a/src/core/codestream/ojph_tile.h b/src/core/codestream/ojph_tile.h index 056c7c9..6b65a13 100644 --- a/src/core/codestream/ojph_tile.h +++ b/src/core/codestream/ojph_tile.h @@ -47,7 +47,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class codestream; namespace local { @@ -89,6 +89,7 @@ namespace ojph { ui32 *num_bits; bool *is_signed; ui32 *cur_line; + bool *nlt_type3; int prog_order; private: diff --git a/src/core/codestream/ojph_tile_comp.h b/src/core/codestream/ojph_tile_comp.h index def39e5..62b8fba 100644 --- a/src/core/codestream/ojph_tile_comp.h +++ b/src/core/codestream/ojph_tile_comp.h @@ -48,7 +48,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class codestream; namespace local { diff --git a/src/core/coding/ojph_block_common.cpp b/src/core/coding/ojph_block_common.cpp index e6b4de6..2ba138a 100644 --- a/src/core/coding/ojph_block_common.cpp +++ b/src/core/coding/ojph_block_common.cpp @@ -84,11 +84,20 @@ namespace ojph { * + 4 * mel event for initial row of quads when needed \n * \n * Each entry contains, starting from the LSB \n - * \li \c total prefix length for quads 0 and 1 (3 bits) \n - * \li \c total suffix length for quads 0 and 1 (4 bits) \n + * \li \c total total prefix length for quads 0 and 1 (3 bits) \n + * \li \c total total suffix length for quads 0 and 1 (4 bits) \n * \li \c suffix length for quad 0 (3 bits) \n * \li \c prefix for quad 0 (3 bits) \n * \li \c prefix for quad 1 (3 bits) \n + * \n + * Another table is uvlc_bias, which is needed to correctly decode the + * extension u_ext for initial row of quads. Under certain condition, + * we deduct 1 or 2 from u_q0 and u_q1 before encoding them; so for us + * to know that decoding u_ext is needed, we recreate the u_q0 and u_q1 + * that we actually encoded. \n + * For simplicity, we use the same index as before \n + * \li \c u_q0 bias is 2 bits \n + * \li \c u_q1 bias is 2 bits \n */ /// @brief uvlc_tbl0 contains decoding information for initial row of quads @@ -96,6 +105,8 @@ namespace ojph { /// @brief uvlc_tbl1 contains decoding information for non-initial row of /// quads ui16 uvlc_tbl1[256] = { 0 }; + /// @brief uvlc_bias contains decoding info. 
for initial row of quads + ui8 uvlc_bias[256+64] = { 0 }; /// @} //************************************************************************/ @@ -199,8 +210,10 @@ namespace ojph { ui32 mode = i >> 6; ui32 vlc = i & 0x3F; - if (mode == 0) // both u_off are 0 + if (mode == 0) { // both u_off are 0 uvlc_tbl0[i] = 0; + uvlc_bias[i] = 0; + } else if (mode <= 2) // u_off are either 01 or 10 { ui32 d = dec[vlc & 0x7]; //look at the least significant 3 bits @@ -232,6 +245,7 @@ namespace ojph { total_suffix = u0_suffix_len; u0 = d0 >> 5; u1 = (vlc & 1) + 1; + uvlc_bias[i] = 4; // 0b00 for u0 and 0b01 for u1 } else { @@ -240,6 +254,7 @@ namespace ojph { total_suffix = u0_suffix_len + ((d1 >> 2) & 0x7); u0 = d0 >> 5; u1 = d1 >> 5; + uvlc_bias[i] = 0; } uvlc_tbl0[i] = (ui16)(total_prefix | @@ -265,6 +280,7 @@ namespace ojph { (u0_suffix_len << 7) | (u0 << 10) | (u1 << 13)); + uvlc_bias[i] = 10; // 0b10 for u0 and 0b10 for u1 } } diff --git a/src/core/coding/ojph_block_common.h b/src/core/coding/ojph_block_common.h index 29a84ba..f8d6503 100644 --- a/src/core/coding/ojph_block_common.h +++ b/src/core/coding/ojph_block_common.h @@ -44,6 +44,6 @@ namespace ojph{ extern ui16 vlc_tbl1[1024]; extern ui16 uvlc_tbl0[256+64]; extern ui16 uvlc_tbl1[256]; - + extern ui8 uvlc_bias[256+64]; } // !namespace local } // !namespace ojph diff --git a/src/core/coding/ojph_block_decoder.h b/src/core/coding/ojph_block_decoder.h index dcd3220..a197017 100644 --- a/src/core/coding/ojph_block_decoder.h +++ b/src/core/coding/ojph_block_decoder.h @@ -50,7 +50,12 @@ namespace ojph { // generic decoder bool - ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data, + ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + + bool + ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal); @@ -60,6 +65,12 @@ namespace ojph { ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal); + // AVX2-accelerated decoder + bool + ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + // WASM SIMD-accelerated decoder bool ojph_decode_codeblock_wasm(ui8* coded_data, ui32* decoded_data, diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder32.cpp similarity index 98% rename from src/core/coding/ojph_block_decoder.cpp rename to src/core/coding/ojph_block_decoder32.cpp index 5be5430..f54c77e 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder32.cpp @@ -739,11 +739,11 @@ namespace ojph { * @param [in] stride is the decoded codeblock buffer stride * @param [in] stripe_causal is true for stripe causal mode */ - bool ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data, - ui32 missing_msbs, ui32 num_passes, - ui32 lengths1, ui32 lengths2, - ui32 width, ui32 height, ui32 stride, - bool stripe_causal) + bool ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, + bool stripe_causal) { static bool insufficient_precision = false; static bool modify_code = false; @@ -753,14 +753,14 @@ namespace 
ojph { { OJPH_WARN(0x00010001, "A malformed codeblock that has more than " "one coding pass, but zero length for " - "2nd and potential 3rd pass"); + "2nd and potential 3rd pass."); num_passes = 1; } if (num_passes > 3) { OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " - "This codeblocks has %d passes", + "This codeblocks has %d passes.", num_passes); return false; } @@ -772,7 +772,7 @@ namespace ojph { insufficient_precision = true; OJPH_WARN(0x00010003, "32 bits are not enough to decode this " "codeblock. This message will not be " - "displayed again"); + "displayed again."); } return false; } @@ -783,7 +783,7 @@ namespace ojph { OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " "pass. The code can be modified to support " "this case. This message will not be " - "displayed again"); + "displayed again."); } return false; // 32 bits are not enough to decode this } @@ -796,7 +796,7 @@ namespace ojph { OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " "nor MagRef passes; both will be skipped. " "This message will not be displayed " - "again"); + "again."); } } } @@ -806,7 +806,7 @@ namespace ojph { if (lengths1 < 2) { - OJPH_WARN(0x00010006, "Wrong codeblock length"); + OJPH_WARN(0x00010006, "Wrong codeblock length."); return false; } @@ -1079,7 +1079,7 @@ namespace ojph { // quad 0 length len = uvlc_entry & 0x7; // quad 0 suffix length uvlc_entry >>= 3; - ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); sp[1] = u_q; u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q sp[3] = u_q; @@ -1217,7 +1217,7 @@ namespace ojph { ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1? ui32 emax = vp[0] | vp[1]; - emax = 31 - count_leading_zeros(emax | 2); // emax - 1 + emax = 31 - count_leading_zeros(emax | 2); // emax - 1 ui32 kappa = gamma ? emax : 1; ui32 U_q = u_q + kappa; @@ -1613,4 +1613,4 @@ namespace ojph { return true; } } -} +} \ No newline at end of file diff --git a/src/core/coding/ojph_block_decoder64.cpp b/src/core/coding/ojph_block_decoder64.cpp new file mode 100644 index 0000000..8801735 --- /dev/null +++ b/src/core/coding/ojph_block_decoder64.cpp @@ -0,0 +1,1663 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2019, The University of New South Wales, Australia +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_block_decoder64.cpp
+// Author: Aous Naman
+// Date: 13 May 2022
+//***************************************************************************/
+
+//***************************************************************************/
+/** @file ojph_block_decoder64.cpp
+ *  @brief implements a HTJ2K block decoder
+ */
+
+#include <string>
+#include <iostream>
+
+#include <cassert>
+#include <cstring>
+#include "ojph_block_common.h"
+#include "ojph_block_decoder.h"
+#include "ojph_arch.h"
+#include "ojph_message.h"
+
+namespace ojph {
+  namespace local {
+
+    //************************************************************************/
+    /** @brief MEL state structure for reading and decoding the MEL bitstream
+     *
+     *  A number of events is decoded from the MEL bitstream ahead of time
+     *  and stored in run/num_runs.
+     *  Each run represents the number of zero events before a one event.
+     */
+    struct dec_mel_st {
+      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
+        k(0), num_runs(0), runs(0)
+      {}
+      // data decoding machinery
+      ui8* data;    //!< the address of data (or bitstream)
+      ui64 tmp;     //!< temporary buffer for read data
+      int bits;     //!< number of bits stored in tmp
+      int size;     //!< number of bytes in MEL code
+      bool unstuff; //!< true if the next bit needs to be unstuffed
+      int k;        //!< state of MEL decoder
+
+      // queue of decoded runs
+      int num_runs; //!< number of decoded runs left in runs (maximum 8)
+      ui64 runs;    //!< runs of decoded MEL codewords (7 bits/run)
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs the MEL bitstream
+     *
+     *  @param [in] melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_read(dec_mel_st *melp)
+    {
+      if (melp->bits > 32) //there are enough bits in the tmp variable
+        return;            // return without reading new data
+
+      ui32 val = 0xFFFFFFFF;      // feed in 0xFF if buffer is exhausted
+      if (melp->size > 4) {       // if there is data in the MEL segment
+        val = *(ui32*)melp->data; // read 32 bits from MEL data
+        melp->data += 4;          // advance pointer
+        melp->size -= 4;          // reduce counter
+      }
+      else if (melp->size > 0)
+      { // 4 or less
+        int i = 0;
+        while (melp->size > 1) {
+          ui32 v = *melp->data++; // read one byte at a time
+          ui32 m = ~(0xFFu << i); // mask of location
+          val = (val & m) | (v << i);// put one byte in its correct location
+          --melp->size;
+          i += 8;
+        }
+        // size equal to 1
+        ui32 v = *melp->data++; // the one before the last is different
+        v |= 0xF;               // MEL and VLC segments can overlap
+        ui32 m = ~(0xFFu << i);
+        val = (val & m) | (v << i);
+        --melp->size;
+      }
+
+      // next we unstuff them before adding them to the buffer
+      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
+                                     // the previously read byte requires
+                                     // unstuffing
+
+      // data is unstuffed and accumulated in t
+      // bits has the number of bits in t
+      ui32 t = val & 0xFF;
+      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
+      bits -= unstuff; // there is one less bit in t if unstuffing is needed
+      t = t << (8 - unstuff); // move up to make room for the next byte
+
+      //this is a repeat of the above
+      t |= (val>>8) & 0xFF;
+      unstuff = (((val >> 8) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val>>16) & 0xFF;
+      unstuff = (((val >> 16) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val>>24) & 0xFF;
+      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+
+      // move t to tmp, and push the result all the way up, so we read from
+      // the MSB
+      melp->tmp |= ((ui64)t) << (64 - bits -
melp->bits); + melp->bits += bits; //increment the number of bits in tmp + } + + //************************************************************************/ + /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs + * + * Runs are stored in "runs" and the number of runs in "num_runs". + * Each run represents a number of zero events that may or may not + * terminate in a 1 event. + * Each run is stored in 7 bits. The LSB is 1 if the run terminates in + * a 1 event, 0 otherwise. The next 6 bits, for the case terminating + * with 1, contain the number of consecutive 0 zero events * 2; for the + * case terminating with 0, they store (number of consecutive 0 zero + * events - 1) * 2. + * A total of 6 bits (made up of 1 + 5) should have been enough. + * + * @param [in] melp is a pointer to dec_mel_st structure + */ + static inline + void mel_decode(dec_mel_st *melp) + { + static const int mel_exp[13] = { //MEL exponents + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5 + }; + + if (melp->bits < 6) // if there are less than 6 bits in tmp + mel_read(melp); // then read from the MEL bitstream + // 6 bits is the largest decodable MEL cwd + + //repeat so long that there is enough decodable bits in tmp, + // and the runs store is not full (num_runs < 8) + while (melp->bits >= 6 && melp->num_runs < 8) + { + int eval = mel_exp[melp->k]; // number of bits associated with state + int run = 0; + if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB) + { //one is found + run = 1 << eval; + run--; // consecutive runs of 0 events - 1 + melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12 + melp->tmp <<= 1; // consume one bit from tmp + melp->bits -= 1; + run = run << 1; // a stretch of zeros not terminating in one + } + else + { //0 is found + run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1); + melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0 + melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6) + melp->bits -= eval + 1; + run = (run << 1) + 1; // a stretch of zeros terminating with one + } + eval = melp->num_runs * 7; // 7 bits per run + melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient + melp->runs |= ((ui64)run) << eval; // store the value in runs + melp->num_runs++; // increment count + } + } + + //************************************************************************/ + /** @brief Initiates a dec_mel_st structure for MEL decoding and reads + * some bytes in order to get the read address to a multiple + * of 4 + * + * @param [in] melp is a pointer to dec_mel_st structure + * @param [in] bbuf is a pointer to byte buffer + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup) + { + melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL + melp->bits = 0; // 0 bits in tmp + melp->tmp = 0; // + melp->unstuff = false; // no unstuffing + melp->size = scup - 1; // size is the length of MEL+VLC-1 + melp->k = 0; // 0 for state + melp->num_runs = 0; // num_runs is 0 + melp->runs = 0; // + + //This code is borrowed; original is for a different architecture + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. 
It reads 1,2,3 up to 4 bytes from the MEL segment
+      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
+      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
+        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
+        ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
+                                                       //set data to 0xFF
+        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
+                                       // see the standard
+        melp->data += melp->size-- > 0; //increment if the end is not reached
+        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
+        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
+        melp->bits += d_bits; //increment tmp by number of bits
+        melp->unstuff = ((d & 0xFF) == 0xFF); //true if next byte needs
+                                              //unstuffing
+      }
+      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
+                                       // is the MSB
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves one run from dec_mel_st; if there are no runs stored
+     *         MEL segment is decoded
+     *
+     * @param [in] melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    int mel_get_run(dec_mel_st *melp)
+    {
+      if (melp->num_runs == 0) //if no runs, decode more bits from MEL segment
+        mel_decode(melp);
+
+      int t = melp->runs & 0x7F; //retrieve one run
+      melp->runs >>= 7; // remove the retrieved run
+      melp->num_runs--;
+      return t; // return run
+    }
+
+    //************************************************************************/
+    /** @brief A structure for reading and unstuffing a segment that grows
+     *         backward, such as VLC and MRP
+     */
+    struct rev_struct {
+      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
+      {}
+      //storage
+      ui8* data;    //!< pointer to where to read data
+      ui64 tmp;     //!< temporary buffer of read data
+      ui32 bits;    //!< number of bits stored in tmp
+      int size;     //!< number of bytes left
+      bool unstuff; //!< true if the last byte is more than 0x8F
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs one byte from a backward-growing segment
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read8(rev_struct *vlcp)
+    {
+      ui8 val = 0;
+      if (vlcp->size > 0) // if there are bytes left in the VLC segment
+      {
+        val = *vlcp->data; // then read 8 bits
+        --vlcp->data;      // decrement data pointer
+        --vlcp->size;      // decrement number of bytes in the buffer
+      }
+
+      // accumulate in tmp, and increment bits, check if unstuffing is needed
+      ui8 t = (vlcp->unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0;
+      val = (ui8)(val & (0xFFU >> t)); // protect against erroneous 1 in MSB
+      vlcp->tmp |= (ui64)val << vlcp->bits;
+      vlcp->bits += 8 - t;
+      vlcp->unstuff = val > 0x8F;
+    }
+
+    //************************************************************************/
+    /** @brief Initiates the rev_struct structure and reads the first byte
+     *
+     *  This subroutine initializes the VLC decoder. It discards the first
+     *  12 bits (they have the sum of the lengths of VLC and MEL segments),
+     *  and depending on unstuffing, stores 3 or 4 bits in the unstuffed
+     *  decoded buffer.
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     *  @param [in] data is a pointer to byte at the start of the cleanup pass
+     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in] scup is the length of MEL+VLC segments
+     */
+    static inline
+    void rev_init8(rev_struct *vlcp, ui8* data, int lcup, int scup)
+    {
+      //first byte has only the upper 4 bits
+      vlcp->data = data + lcup - 2;
+
+      //size cannot be larger than this, in fact it should be smaller
+      vlcp->size = scup - 2;
+
+      ui8 val = *vlcp->data--; // read one byte (this is a half byte)
+
+      // the first byte is treated differently from other bytes, because
+      // only the MSB nibble is part of the VLC code.
+      val = (ui8)(val >> 4);
+      ui8 t = ((val & 0x7) == 0x7) ?
1 : 0; // unstuffing is needed
+      val = (ui8)(val & (0xFU >> t)); // protect against erroneous 1 in MSB
+      vlcp->tmp = val;
+      vlcp->bits = 4 - t;
+      vlcp->unstuff = val > 0x8; //this is useful for the next byte
+    }
+
+    //************************************************************************/
+    /** @brief Fills the temporary variable (vlcp->tmp) by up to 64 bits
+     *
+     *  By the end of this call, vlcp->tmp must have no less than 56 bits
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    ui64 rev_fetch64(rev_struct *vlcp)
+    {
+      while (vlcp->bits <= 56)
+        rev_read8(vlcp); // read 8 bits, but unstuffing might reduce this
+      return vlcp->tmp;  // return unstuff decoded bits
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     *  @param [in] num_bits is the number of bits to be removed
+     */
+    static inline
+    ui64 rev_advance64(rev_struct *vlcp, ui32 num_bits)
+    {
+      assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
+      vlcp->tmp >>= num_bits; // remove bits
+      vlcp->bits -= num_bits; // decrement the number of bits
+      return vlcp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs from rev_struct
+     *
+     *  This differs from rev_read in that it fills in zeros when the
+     *  available data is consumed; the other does not care about the
+     *  values when all data is consumed.
+     *
+     *  See rev_read for more information about unstuffing
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read_mrp(rev_struct *mrp)
+    {
+      //process 4 bytes at a time
+      if (mrp->bits > 32)
+        return;
+      ui32 val = 0;
+      if (mrp->size > 3) // if there are 4 bytes or more
+      { // (mrp->data - 3) move pointer back to read 32 bits at once
+        val = *(ui32*)(mrp->data - 3); // read 32 bits
+        mrp->data -= 4;                // move back pointer
+        mrp->size -= 4;                // reduce count
+      }
+      else if (mrp->size > 0)
+      {
+        int i = 24;
+        while (mrp->size > 0) {
+          ui32 v = *mrp->data--; // read one byte at a time
+          val |= (v << i);       // put byte in its correct location
+          --mrp->size;
+          i -= 8;
+        }
+      }
+
+      //accumulate in tmp, and keep count in bits
+      ui32 bits, tmp = val >> 24;
+
+      //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
+      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
+      bool unstuff = (val >> 24) > 0x8F;
+
+      //process the next byte
+      tmp |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 16) & 0xFF) > 0x8F;
+
+      tmp |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 8) & 0xFF) > 0x8F;
+
+      tmp |= (val & 0xFF) << bits;
+      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = (val & 0xFF) > 0x8F;
+
+      mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
+      mrp->bits += bits;
+      mrp->unstuff = unstuff;             // next byte
+    }
+
+    //************************************************************************/
+    /** @brief Initializes the rev_struct structure for the MRP segment, and
+     *         reads a number of bytes such that the next 32 bits read are
+     *         from an address that is a multiple of 4.
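+     *         For example, if mrp->data initially points to an address
+     *         ending in 0x06, then num = 1 + (0x06 & 3) = 3 bytes are read
+     *         one at a time, leaving the pointer so that the following
+     *         32-bit reads are 4-byte aligned.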
Note this is designed for
+     *         an architecture where the read size must be compatible with
+     *         the alignment of the read address
+     *
+     *  There is another similar subroutine rev_init. This subroutine does
+     *  NOT skip the first 12 bits, and starts with unstuff set to true.
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     *  @param [in] data is a pointer to byte at the start of the cleanup pass
+     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in] len2 is the length of SPP+MRP segments
+     */
+    static inline
+    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
+    {
+      mrp->data = data + lcup + len2 - 1;
+      mrp->size = len2;
+      mrp->unstuff = true;
+      mrp->bits = 0;
+      mrp->tmp = 0;
+
+      //This code is designed for an architecture where the read address
+      // should align to the read size (address multiple of 4 if read size
+      // is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream
+      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
+      for (int i = 0; i < num; ++i) {
+        ui64 d;
+        //read a byte, 0 if no more data
+        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
+        //check if unstuffing is needed
+        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
+        mrp->tmp |= d << mrp->bits; // move data to mrp->tmp
+        mrp->bits += d_bits;
+        mrp->unstuff = d > 0x8F; // for next byte
+      }
+      rev_read_mrp(mrp);
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, mrp->tmp must have no less than 33 bits
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch_mrp(rev_struct *mrp)
+    {
+      if (mrp->bits < 32) // if there are fewer than 32 bits in mrp->tmp
+      {
+        rev_read_mrp(mrp);   // read 30-32 bits from mrp
+        if (mrp->bits < 32)  // if there is still space for 32 bits
+          rev_read_mrp(mrp); // read more
+      }
+      return (ui32)mrp->tmp; // return the head of mrp->tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     *  @param [in] num_bits is the number of bits to be removed
+     */
+    static inline
+    ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
+    {
+      assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
+      mrp->tmp >>= num_bits; // discard the lowest num_bits bits
+      mrp->bits -= num_bits;
+      return (ui32)mrp->tmp; // return data after consumption
+    }
+
+    //************************************************************************/
+    /** @brief State structure for reading and unstuffing of forward-growing
+     *         bitstreams; these are: MagSgn and SPP bitstreams
+     */
+    struct frwd_struct {
+      const ui8* data; //!< pointer to bitstream
+      ui64 tmp;        //!< temporary buffer of read data
+      ui32 bits;       //!< number of bits stored in tmp
+      bool unstuff;    //!< true if a bit needs to be unstuffed from next byte
+      int size;        //!< size of data
+    };
+
+    //************************************************************************/
+    /** @brief Read and unstuffs 32 bits from forward-growing bitstream
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted
+     *  @param [in] msp is a pointer to frwd_struct structure
+     */
+    template<int X>
+    static inline
+    void frwd_read(frwd_struct *msp)
+    {
+      assert(msp->bits <= 32); // assert that there is a space for 32 bits
+
+      ui32 val = 0;
+      if (msp->size > 3) {
+        val = *(ui32*)msp->data; // read 32 bits
+        msp->data += 4;          // increment pointer
+        msp->size -= 4;          // reduce size
+      }
+      else if (msp->size > 0)
+      {
+        int i = 0;
+        val = X != 0 ? 0xFFFFFFFFu : 0;
+        while (msp->size > 0) {
+          ui32 v = *msp->data++;  // read one byte at a time
+          ui32 m = ~(0xFFu << i); // mask of location
+          val = (val & m) | (v << i);// put one byte in its correct location
+          --msp->size;
+          i += 8;
+        }
+      }
+      else
+        val = X != 0 ?
0xFFFFFFFFu : 0;
+
+      // we accumulate in t and keep a count of the number of bits in bits
+      ui32 bits = 8 - msp->unstuff;
+      ui32 t = val & 0xFF;
+      bool unstuff = ((val & 0xFF) == 0xFF); // Do we need unstuffing next?
+
+      t |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - unstuff;
+      unstuff = (((val >> 8) & 0xFF) == 0xFF);
+
+      t |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - unstuff;
+      unstuff = (((val >> 16) & 0xFF) == 0xFF);
+
+      t |= ((val >> 24) & 0xFF) << bits;
+      bits += 8 - unstuff;
+      msp->unstuff = (((val >> 24) & 0xFF) == 0xFF); // for next byte
+
+      msp->tmp |= ((ui64)t) << msp->bits; // move data to msp->tmp
+      msp->bits += bits;
+    }
+
+    //************************************************************************/
+    /** @brief Read and unstuffs 8 bits from forward-growing bitstream
+     *
+     *  A template is used to accommodate a different requirement for
+     *  MagSgn and SPP bitstreams; in particular, when MagSgn bitstream is
+     *  consumed, 0xFF's are fed, while when SPP is exhausted 0's are fed in.
+     *  X controls this value.
+     *
+     *  Unstuffing prevents sequences larger than 0xFF7F from appearing in
+     *  the compressed sequence. So whenever a value of 0xFF is coded, the
+     *  MSB of the next byte is set to 0 and must be ignored during decoding.
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted
+     *  @param [in] msp is a pointer to frwd_struct structure
+     *
+     */
+    template<int X>
+    static inline
+    void frwd_read8(frwd_struct *msp)
+    {
+      ui8 val = X;
+      if (msp->size > 0) {
+        val = *msp->data; // read 8 bits
+        ++msp->data;      // increment pointer
+        --msp->size;      // reduce size
+      }
+
+      // unstuff and accumulate
+      ui8 t = msp->unstuff ? 1 : 0;
+      val = (ui8)(val & (0xFFU >> t));
+      msp->unstuff = (val == 0xFF);
+      msp->tmp |= ((ui64)val) << msp->bits; // move data to msp->tmp
+      msp->bits += 8 - t;
+    }
+
+    //************************************************************************/
+    /** @brief Initializes frwd_struct and reads some bytes
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted.
+     *          See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     *  @param [in] data is a pointer to the start of data
+     *  @param [in] size is the number of bytes in the bitstream
+     */
+    template<int X>
+    static inline
+    void frwd_init(frwd_struct *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      msp->tmp = 0;
+      msp->bits = 0;
+      msp->unstuff = 0;
+      msp->size = size;
+
+      //This code is designed for an architecture where the read address
+      // should align to the read size (address multiple of 4 if read size
+      // is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the bitstream
+      int num = 4 - (int)(intptr_t(msp->data) & 0x3);
+      for (int i = 0; i < num; ++i)
+      {
+        ui64 d;
+        //read a byte if the buffer is not exhausted, otherwise set it to X
+        d = msp->size-- > 0 ? *msp->data++ : X;
+        msp->tmp |= (d << msp->bits);        // store data in msp->tmp
+        msp->bits += 8 - msp->unstuff;       // number of bits added to tmp
+        msp->unstuff = ((d & 0xFF) == 0xFF); // unstuffing for next byte
+      }
+      frwd_read<X>(msp); // read 32 bits more
+    }
+
+    //************************************************************************/
+    /** @brief Initializes frwd_struct and reads some bytes
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted.
+     *          See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     *  @param [in] data is a pointer to the start of data
+     *  @param [in] size is the number of bytes in the bitstream
+     */
+    template<int X>
+    static inline
+    void frwd_init8(frwd_struct *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      msp->tmp = 0;
+      msp->bits = 0;
+      msp->unstuff = 0;
+      msp->size = size;
+      frwd_read8<X>(msp); // read 8 bits
+    }
+
+    //************************************************************************/
+    /** @brief Consume num_bits bits from the bitstream of frwd_struct
+     *
+     *  @param [in] msp is a pointer to frwd_struct
+     *  @param [in] num_bits is the number of bits to consume
+     */
+    static inline
+    void frwd_advance(frwd_struct *msp, ui32 num_bits)
+    {
+      assert(num_bits <= msp->bits);
+      msp->tmp >>= num_bits; // consume num_bits
+      msp->bits -= num_bits;
+    }
+
+    //************************************************************************/
+    /** @brief Fetches 32 bits from the frwd_struct bitstream
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted.
+     *          See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     */
+    template<int X>
+    static inline
+    ui32 frwd_fetch(frwd_struct *msp)
+    {
+      if (msp->bits < 32)
+      {
+        frwd_read<X>(msp);
+        if (msp->bits < 32) //need to test
+          frwd_read<X>(msp);
+      }
+      return (ui32)msp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Fetches up to 64 bits from the frwd_struct bitstream
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted.
+     *          See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     */
+    template<int X>
+    static inline
+    ui64 frwd_fetch64(frwd_struct *msp)
+    {
+      while (msp->bits <= 56)
+        frwd_read8<X>(msp);
+      return msp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Decodes one codeblock, processing the cleanup, significance
+     *         propagation, and magnitude refinement pass
+     *
+     *  @param [in] coded_data is a pointer to bitstream
+     *  @param [in] decoded_data is a pointer to decoded codeblock data buf.
+     *  @param [in] missing_msbs is the number of missing MSBs
+     *  @param [in] num_passes is the number of passes: 1 if CUP only,
+     *              2 for CUP+SPP, and 3 for CUP+SPP+MRP
+     *  @param [in] lengths1 is the length of cleanup pass
+     *  @param [in] lengths2 is the length of refinement passes (either SPP
+     *              only or SPP+MRP)
+     *  @param [in] width is the decoded codeblock width
+     *  @param [in] height is the decoded codeblock height
+     *  @param [in] stride is the decoded codeblock buffer stride
+     *  @param [in] stripe_causal is true for stripe causal mode
+     */
+    bool ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data,
+                                 ui32 missing_msbs, ui32 num_passes,
+                                 ui32 lengths1, ui32 lengths2,
+                                 ui32 width, ui32 height, ui32 stride,
+                                 bool stripe_causal)
+    {
+      // static bool insufficient_precision = false;
+      // static bool modify_code = false;
+      // static bool truncate_spp_mrp = false;
+
+      if (num_passes > 1 && lengths2 == 0)
+      {
+        OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
+                              "one coding pass, but zero length for "
+                              "2nd and potential 3rd pass.");
+        num_passes = 1;
+      }
+
+      if (num_passes > 3)
+      {
+        OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
+                              "This codeblock has %d passes.",
+                              num_passes);
+        return false;
+      }
+
+      // if (missing_msbs > 30) // p < 0
+      // {
+      //   if (insufficient_precision == false)
+      //   {
+      //     insufficient_precision = true;
+      //     OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
+      //                           "codeblock. This message will not be "
+      //                           "displayed again.");
+      //   }
+      //   return false;
+      // }
+      // else if (missing_msbs == 30) // p == 0
+      // { // not enough precision to decode and set the bin center to 1
+      //   if (modify_code == false) {
+      //     modify_code = true;
+      //     OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
+      //                           "pass. The code can be modified to support "
+      //                           "this case. This message will not be "
+      //                           "displayed again.");
+      //   }
+      //   return false; // 32 bits are not enough to decode this
+      // }
+      // else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
+      // {
+      //   if (num_passes > 1) {
+      //     num_passes = 1;
+      //     if (truncate_spp_mrp == false) {
+      //       truncate_spp_mrp = true;
+      //       OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
+      //                             "nor MagRef passes; both will be skipped. "
+      //                             "This message will not be displayed "
+      //                             "again.");
+      //     }
+      //   }
+      // }
+      ui32 p = 62 - missing_msbs; // The least significant bitplane for CUP
+      // There is a way to handle the case of p == 0, but a different path
+      // is required
+
+      if (lengths1 < 2)
+      {
+        OJPH_WARN(0x00010006, "Wrong codeblock length.");
+        return false;
+      }
+
+      // read scup and fix the bytes there
+      int lcup, scup;
+      lcup = (int)lengths1; // length of CUP
+      //scup is the length of MEL + VLC
+      scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
+      if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
+        return false;
+
+      // The temporary storage scratch holds two types of data in an
+      // interleaved fashion. The interleaving allows us to use one
+      // memory pointer.
+      // We have one entry for a decoded VLC code, and one entry for UVLC.
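+      // Within a quad row, even ui16 entries hold the decoded VLC
+      // information and odd entries hold u_q; see sp[0]/sp[2] and
+      // sp[1]/sp[3] below.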
+
+      // The temporary storage scratch holds two types of data in an
+      // interleaved fashion. The interleaving allows us to use one
+      // memory pointer.
+      // We have one entry for a decoded VLC code, and one entry for UVLC.
+      // Entries are 16 bits each, corresponding to one quad,
+      // but since we want to use XMM registers of the SSE family
+      // of SIMD, we allocate 16 bytes or more per quad row; that is,
+      // the width is no smaller than 16 bytes (or 8 entries), and the
+      // height is 512 quads
+      // Each VLC entry contains, in the following order, starting
+      // from MSB
+      // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
+      // Each entry in UVLC contains u_q
+      // One extra row to handle the case of SPP propagating downwards
+      // when codeblock width is 4
+      ui16 scratch[8 * 513] = {0}; // ~8 kB
+
+      // We need an extra two entries (one inf and one u_q) beyond
+      // the last column.
+      // If the block width is 4 (2 quads), then we use an sstr of 8
+      // (enough for 4 quads). If width is 8 (4 quads) we use an
+      // sstr of 16 (enough for 8 quads). For a width of 16 (8
+      // quads), we use 24 (enough for 12 quads).
+      ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
+
+      ui32 mmsbp2 = missing_msbs + 2;
+
+      // The cleanup pass is decoded in two steps; in step one,
+      // the VLC and MEL segments are decoded, generating a record that
+      // has 2 bytes per quad. The 2 bytes contain u, rho, e^1 & e^k.
+      // This information should be sufficient for the next step.
+      // In step 2, we decode the MagSgn segment.
+
+      // step 1 decoding VLC and MEL segments
+      {
+        // init structures
+        dec_mel_st mel;
+        mel_init(&mel, coded_data, lcup, scup);
+        rev_struct vlc;
+        rev_init8(&vlc, coded_data, lcup, scup);
+
+        int run = mel_get_run(&mel); // decode runs of events from MEL bitstream
+                                     // data represented as runs of 0 events
+                                     // See mel_decode description
+
+        ui64 vlc_val;
+        ui32 c_q = 0;
+        ui16 *sp = scratch;
+        //initial quad row
+        for (ui32 x = 0; x < width; sp += 4)
+        {
+          // decode VLC
+          /////////////
+
+          // first quad
+          vlc_val = rev_fetch64(&vlc);
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0) //zero context
+          {
+            run -= 2; //subtract 2, since event counts are multiplied by 2
+
+            // Is the run terminated in 1? if so, use decoded VLC code;
+            // otherwise, discard decoded data, since we will decode again
+            // using a different context
+            t0 = (run == -1) ? t0 : 0;
+
+            // is run -1 or -2? this means a run has been consumed
+            if (run < 0)
+              run = mel_get_run(&mel); // get another run
+          }
+          //run -= (c_q == 0) ? 2 : 0;
+          //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+          sp[0] = t0;
+          x += 2;
+
+          // prepare context for the next quad; eqn. 1 in ITU T.814
+          c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
+
+          //remove data from vlc stream (0 bits are removed if vlc is not used)
+          vlc_val = rev_advance64(&vlc, t0 & 0x7);
+
+          //second quad
+          ui16 t1 = 0;
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0 && x < width) //zero context
+          {
+            run -= 2; //subtract 2, since event counts are multiplied by 2
+
+            // if event is 0, discard decoded t1
+            t1 = (run == -1) ? t1 : 0;
+
+            if (run < 0) // have we consumed all events in a run
+              run = mel_get_run(&mel); // if yes, then get another run
+          }
+          t1 = x < width ? t1 : 0;
+          //run -= (c_q == 0 && x < width) ? 2 : 0;
+          //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+          sp[2] = t1;
+          x += 2;
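+
+          // Worked example (illustrative, not in the original): suppose
+          // mel_get_run() returned 5, i.e. 5 = 2*2 + 1: two zero events
+          // followed by a one event.  The first two zero-context quads
+          // each consume one event (run: 5 -> 3 -> 1) and discard their
+          // speculatively decoded VLC codes; the third zero-context quad
+          // sees run == -1 (1 - 2), keeps its decoded code, and a new run
+          // is then fetched from the MEL decoder.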
+
+          //prepare context for the next quad, eqn. 1 in ITU T.814
+          c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
+
+          //remove data from vlc stream, if qinf is not used, cwdlen is 0
+          vlc_val = rev_advance64(&vlc, t1 & 0x7);
+
+          // decode u
+          /////////////
+          // uvlc_mode is made up of u_offset bits from the quad pair
+          ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+          if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
+          {                     // the MEL run of events
+            run -= 2; //subtract 2, since event counts are multiplied by 2
+
+            uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
+                                                 // 0x40 if the event is 1
+
+            if (run < 0)//if run is consumed (run is -1 or -2), get another run
+              run = mel_get_run(&mel);
+          }
+          //run -= (uvlc_mode == 0xc0) ? 2 : 0;
+          //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+
+          //decode uvlc_mode to get u for both quads
+          ui32 idx = uvlc_mode + (ui32)(vlc_val & 0x3F);
+          ui32 uvlc_entry = uvlc_tbl0[idx];
+          ui16 u_bias = uvlc_bias[idx];
+          //remove total prefix length
+          vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+          uvlc_entry >>= 3;
+          //extract suffixes for quad 0 and 1
+          ui32 len = uvlc_entry & 0xF; // suffix length for 2 quads
+          ui32 tmp = (ui32)(vlc_val & ((1 << len) - 1)); // suffix value for 2 quads
+          vlc_val = rev_advance64(&vlc, len);
+          uvlc_entry >>= 4;
+          // quad 0 length
+          len = uvlc_entry & 0x7; // quad 0 suffix length
+          uvlc_entry >>= 3;
+          ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+          ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len));
+
+          // decode u_q extensions, which are needed only when u_q > 32
+          ui16 u_ext; bool cond0, cond1;
+          cond0 = u_q0 - (u_bias & 0x3) > 32;
+          u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0);
+          vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+          u_q0 = (ui16)(u_q0 + (u_ext << 2));
+          sp[1] = (ui16)(u_q0 + 1); // kappa = 1
+          cond1 = u_q1 - (u_bias >> 2) > 32;
+          u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0);
+          vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+          u_q1 = (ui16)(u_q1 + (u_ext << 2));
+          sp[3] = (ui16)(u_q1 + 1); // kappa = 1
+        }
+        sp[0] = sp[1] = 0;
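+
+        // Recap (added for illustration, not in the original): after the
+        // loop above, each quad of the initial row occupies two ui16
+        // entries in scratch:
+        //   sp[0] = VLC entry: e_k in 0xF000, e_1 in 0x0F00, rho in 0x00F0
+        //   sp[1] = u_q + 1, where the added 1 is kappa for the first row
+        // For example, sp[0] == 0x2A30 means e_k = 0x2, e_1 = 0xA, and
+        // rho = 0x3, i.e. only the left column of the quad is significant.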
+
+        //non initial quad rows
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          c_q = 0;                              // context
+          ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
+
+          for (ui32 x = 0; x < width; sp += 4)
+          {
+            // decode VLC
+            /////////////
+
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
+
+            // first quad
+            vlc_val = rev_fetch64(&vlc);
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) //zero context
+            {
+              run -= 2; //subtract 2, since event counts are multiplied by 2
+
+              // Is the run terminated in 1? if so, use decoded VLC code;
+              // otherwise, discard decoded data, since we will decode again
+              // using a different context
+              t0 = (run == -1) ? t0 : 0;
+
+              // is run -1 or -2? this means a run has been consumed
+              if (run < 0)
+                run = mel_get_run(&mel); // get another run
+            }
+            //run -= (c_q == 0) ? 2 : 0;
+            //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel); // get another run
+            sp[0] = t0;
+            x += 2;
+
+            // prepare context for the next quad; eqn. 2 in ITU T.814
+            // sigma_q (w, sw)
+            c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[0 - (si32)sstr] & 0x80;
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
+
+            //remove data from vlc stream (0 bits are removed if vlc is unused)
+            vlc_val = rev_advance64(&vlc, t0 & 0x7);
+
+            //second quad
+            ui16 t1 = 0;
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0 && x < width) //zero context
+            {
+              run -= 2; //subtract 2, since event counts are multiplied by 2
+
+              // if event is 0, discard decoded t1
+              t1 = (run == -1) ? t1 : 0;
+
+              if (run < 0) // have we consumed all events in a run
+                run = mel_get_run(&mel); // if yes, then get another run
+            }
+            t1 = x < width ? t1 : 0;
+            //run -= (c_q == 0 && x < width) ? 2 : 0;
+            //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel); // get another run
+            sp[2] = t1;
+            x += 2;
+
+            // partial c_q, will be completed when we process the next quad
+            // sigma_q (w, sw)
+            c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[2 - (si32)sstr] & 0x80;
+
+            //remove data from vlc stream, if qinf is not used, cwdlen is 0
+            vlc_val = rev_advance64(&vlc, t1 & 0x7);
+
+            // decode u
+            /////////////
+            // uvlc_mode is made up of u_offset bits from the quad pair
+            ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+            ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
+            //remove total prefix length
+            vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+            uvlc_entry >>= 3;
+            //extract suffixes for quad 0 and 1
+            ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
+            ui32 tmp = (ui32)(vlc_val & ((1 << len) - 1)); //suffix value for 2 quads
+            vlc_val = rev_advance64(&vlc, len);
+            uvlc_entry >>= 4;
+            // quad 0 length
+            len = uvlc_entry & 0x7; // quad 0 suffix length
+            uvlc_entry >>= 3;
+            ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+            ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
+
+            // decode u_q extensions, which are needed only when u_q > 32
+            ui16 u_ext; bool cond0, cond1;
+            cond0 = u_q0 > 32;
+            u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0);
+            vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+            u_q0 = (ui16)(u_q0 + (u_ext << 2));
+            sp[1] = u_q0;
+            cond1 = u_q1 > 32;
+            u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0);
+            vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+            u_q1 = (ui16)(u_q1 + (u_ext << 2));
+            sp[3] = u_q1;
+          }
+          sp[0] = sp[1] = 0;
+        }
+      }
+
+      // step 2: we decode the MagSgn segment
+      {
+        // We allocate a scratch row for storing v_n values.
+        // We have 512 quads horizontally.
+        // We need an extra entry to handle the case of vp[1]
+        // when vp is at the last column.
+        // Here, we allocate 4 instead of 1 to make the buffer size
+        // a multiple of 16 bytes.
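+        // Note (added for clarity, not in the original): each v_n entry
+        // accumulates the decoded magnitude patterns of bottom-row samples
+        // from two horizontally adjacent quads; the loop over the next row
+        // of quads applies 63 - count_leading_zeros(2 | vp[0] | vp[1]) to
+        // these values to derive the exponent bound that feeds kappa.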
+ const int v_n_size = 512 + 4; + ui64 v_n_scratch[v_n_size] = {0}; // 4+ kB + + frwd_struct magsgn; + frwd_init8<0xFF>(&magsgn, coded_data, lcup - scup); + + const ui16 *sp = scratch; + ui64 *vp = v_n_scratch; + ui64 *dp = decoded_data; + + ui64 prev_v_n = 0; + for (ui32 x = 0; x < width; sp += 2, ++vp) + { + ui32 inf = sp[0]; + ui32 U_q = sp[1]; + if (U_q > mmsbp2) + return false; + + ui64 v_n; + ui64 val = 0; + ui32 bit = 0; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= width) + { ++vp; break; } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + + for (ui32 y = 2; y < height; y += 2) + { + const ui16 *sp = scratch + (y >> 1) * sstr; + ui64 *vp = v_n_scratch; + ui64 *dp = decoded_data + y * stride; + + prev_v_n = 0; + for (ui32 x = 0; x < width; sp += 2, ++vp) + { + ui32 inf = sp[0]; + ui32 u_q = sp[1]; + + ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1? + ui32 emax = 63 - count_leading_zeros(2 | vp[0] | vp[1]); // emax-1 + ui32 kappa = gamma ? 
emax : 1; + + ui32 U_q = u_q + kappa; + if (U_q > mmsbp2) + return false; + + ui64 v_n; + ui64 val = 0; + ui32 bit = 0; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= width) + { ++vp; break; } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + } + } + + if (num_passes > 1) + { + // We use scratch again, we can divide it into multiple regions + // sigma holds all the significant samples, and it cannot + // be modified after it is set. 
+      // It will be used during the
+      // Magnitude Refinement Pass
+      ui16* const sigma = scratch;
+
+      ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
+                                       // ui16 contains 4 columns
+      mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
+
+      // We re-arrange quad significance, where each 4 consecutive
+      // bits represent one quad, into column significance, where
+      // each 4 consecutive bits represent one column of 4 rows
+      {
+        ui32 y;
+        for (y = 0; y < height; y += 4)
+        {
+          ui16* sp = scratch + (y >> 1) * sstr;
+          ui16* dp = sigma + (y >> 2) * mstr;
+          for (ui32 x = 0; x < width; x += 4, sp += 4, ++dp) {
+            ui32 t0 = 0, t1 = 0;
+            t0  = ((sp[0     ] & 0x30u) >> 4) | ((sp[0     ] & 0xC0u) >> 2);
+            t0 |= ((sp[2     ] & 0x30u) << 4) | ((sp[2     ] & 0xC0u) << 6);
+            t1  = ((sp[0+sstr] & 0x30u) >> 2) | ((sp[0+sstr] & 0xC0u)     );
+            t1 |= ((sp[2+sstr] & 0x30u) << 6) | ((sp[2+sstr] & 0xC0u) << 8);
+            dp[0] = (ui16)(t0 | t1);
+          }
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+        {
+          // reset one row after the codeblock
+          ui16* dp = sigma + (y >> 2) * mstr;
+          for (ui32 x = 0; x < width; x += 4, ++dp)
+            dp[0] = 0;
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+      }
+
+      // We perform Significance Propagation Pass here
+      {
+        // This stores significance information of the previous
+        // 4 rows.  Significance information in this array includes
+        // all significant samples in bitplane p - 1; that is,
+        // significant samples for bitplane p (discovered during the
+        // cleanup pass and stored in sigma) and samples that have recently
+        // become significant (during the SPP) in bitplane p-1.
+        // We store enough for the widest row, containing 1024 columns,
+        // which is equivalent to 256 ui16 entries, since each stores 4 columns.
+        // We add an extra 8 entries, just in case we need more
+        ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
+
+        frwd_struct sigprop;
+        frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
+
+        for (ui32 y = 0; y < height; y += 4)
+        {
+          ui32 pattern = 0xFFFFu; // a pattern of needed samples
+          if (height - y < 4) {
+            pattern = 0x7777u;
+            if (height - y < 3) {
+              pattern = 0x3333u;
+              if (height - y < 2)
+                pattern = 0x1111u;
+            }
+          }
+
+          // prev holds significance info for the previous quad, together
+          // with the rows on top of it and below it.
+          ui32 prev = 0;
+          ui16 *prev_sig = prev_row_sig;
+          ui16 *cur_sig = sigma + (y >> 2) * mstr;
+          ui64 *dpp = decoded_data + y * stride;
+          for (ui32 x = 0; x < width; x += 4, ++cur_sig, ++prev_sig)
+          {
+            // only rows and columns inside the stripe are included
+            si32 s = (si32)x + 4 - (si32)width;
+            s = ojph_max(s, 0);
+            pattern = pattern >> (s * 4);
+
+            // We first find locations that need to be tested (potential
+            // SPP members); these locations will end up in mbr.
+            // In each iteration, we produce 16 bits because cwd can have
+            // up to 16 bits of significance information, followed by the
+            // corresponding 16 bits of sign information; therefore, it is
+            // sufficient to fetch 32 bits of data per loop.
+
+            // Although we are interested in 16 bits only, we load 32 bits.
+            // For the 16 bits we are producing, we need the next 4 bits --
+            // We need data for at least 5 columns out of 8.
+            // Therefore loading 32 bits is easier than loading 16 bits
+            // twice.
+            ui32 ps = *(ui32*)prev_sig;
+            ui32 ns = *(ui32*)(cur_sig + mstr);
+            ui32 u = (ps & 0x88888888) >> 3; // the row on top
+            if (!stripe_causal)
+              u |= (ns & 0x11111111) << 3;   // the row below
+
+            ui32 cs = *(ui32*)cur_sig;
+            // vertical integration
+            ui32 mbr = cs; // this sig. info.
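+            // Note (added for clarity, not in the original): each ui16 of
+            // sigma packs a 4-wide by 4-high block column by column, one
+            // nibble per column, with the LSB of a nibble being the top
+            // row.  The two shifts below therefore dilate significance to
+            // vertical neighbors within each column, while the << 4 and
+            // >> 4 shifts that follow spread it to adjacent columns.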
+ mbr |= (cs & 0x77777777) << 1; //above neighbors + mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors + mbr |= u; + // horizontal integration + ui32 t = mbr; + mbr |= t << 4; // neighbors on the left + mbr |= t >> 4; // neighbors on the right + mbr |= prev >> 12; // significance of previous group + + // remove outside samples, and already significant samples + mbr &= pattern; + mbr &= ~cs; + + // find samples that become significant during the SPP + ui32 new_sig = mbr; + if (new_sig) + { + ui64 cwd = frwd_fetch<0>(&sigprop); + + ui32 cnt = 0; + ui32 col_mask = 0xFu; + ui32 inv_sig = ~cs & pattern; + for (int i = 0; i < 16; i += 4, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan one column + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x33u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x76u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xECu << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xC8u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + } + + if (new_sig) + { + // new_sig has newly-discovered sig. samples during SPP + // find the signs and update decoded_data + ui64 *dp = dpp + x; + ui64 val = 3u << (p - 2); + col_mask = 0xFu; + for (int i = 0; i < 4; ++i, ++dp, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan 4 signs + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + assert(dp[0] == 0); + dp[0] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[stride] == 0); + dp[stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[2 * stride] == 0); + dp[2 * stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[3 * stride] == 0); + dp[3 * stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + } + } + frwd_advance(&sigprop, cnt); + } + + new_sig |= cs; + *prev_sig = (ui16)(new_sig); + + // vertical integration for the new sig. info. + t = new_sig; + new_sig |= (t & 0x7777) << 1; //above neighbors + new_sig |= (t & 0xEEEE) >> 1; //below neighbors + // add sig. info. 
+            // from the row on top and below
+            prev = new_sig | u;
+            // we need only the bits in 0xF000
+            prev &= 0xF000;
+          }
+        }
+      }
+
+      // We perform Magnitude Refinement Pass here
+      if (num_passes > 2)
+      {
+        rev_struct magref;
+        rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
+
+        for (ui32 y = 0; y < height; y += 4)
+        {
+          ui32 *cur_sig = (ui32*)(sigma + (y >> 2) * mstr);
+          ui64 *dpp = decoded_data + y * stride;
+          ui64 half = 1ULL << (p - 2);
+          for (ui32 i = 0; i < width; i += 8)
+          {
+            //Process one entry from sigma array at a time
+            // Each nibble (4 bits) in the sigma array represents 4 rows,
+            // and the 32 bits contain 8 columns
+            ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
+            ui32 sig = *cur_sig++; // 32 bit that will be processed now
+            ui32 col_mask = 0xFu;  // a mask for a column in sig
+            if (sig) // if any of the 32 bits are set
+            {
+              for (int j = 0; j < 8; ++j) //one column at a time
+              {
+                if (sig & col_mask) // lowest nibble
+                {
+                  ui64 *dp = dpp + i + j; // next column in decoded samples
+                  ui32 sample_mask = 0x11111111u & col_mask; //LSB
+
+                  for (int k = 0; k < 4; ++k) {
+                    if (sig & sample_mask) //if LSB is set
+                    {
+                      assert(dp[0] != 0); // decoded value cannot be zero
+                      assert((dp[0] & half) == 0); // no half
+                      ui64 sym = cwd & 1; // get its value
+                      sym = (1 - sym) << (p - 1); // previous center of bin
+                      sym |= half; // put half the center of bin
+                      dp[0] ^= sym; // remove old bin center and put new
+                      cwd >>= 1;    // consume one bit
+                    }
+                    sample_mask += sample_mask; //next row
+                    dp += stride;               // next samples row
+                  }
+                }
+                col_mask <<= 4; //next column
+              }
+            }
+            // consume data according to the number of bits set
+            rev_advance_mrp(&magref, population_count(sig));
+          }
+        }
+      }
+      }
+      return true;
+    }
+  }
+}
\ No newline at end of file
diff --git a/src/core/coding/ojph_block_decoder_avx2.cpp b/src/core/coding/ojph_block_decoder_avx2.cpp
new file mode 100644
index 0000000..156ba1a
--- /dev/null
+++ b/src/core/coding/ojph_block_decoder_avx2.cpp
@@ -0,0 +1,2041 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2022, Aous Naman
+// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2022, The University of New South Wales, Australia
+// Copyright (c) 2024, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_block_decoder_avx2.cpp
+//***************************************************************************/
+
+//***************************************************************************/
+/** @file ojph_block_decoder_avx2.cpp
+ *  @brief implements a faster HTJ2K block decoder using avx2
+ */
+
+#include <string>
+#include <iostream>
+
+#include <cassert>
+#include <cstring>
+#include "ojph_block_common.h"
+#include "ojph_block_decoder.h"
+#include "ojph_arch.h"
+#include "ojph_message.h"
+
+#include <immintrin.h>
+
+namespace ojph {
+  namespace local {
+
+    //************************************************************************/
+    /** @brief MEL state structure for reading and decoding the MEL bitstream
+     *
+     *  A number of events is decoded from the MEL bitstream ahead of time
+     *  and stored in run/num_runs.
+     *  Each run represents the number of zero events before a one event.
+     */
+    struct dec_mel_st {
+      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
+                     k(0), num_runs(0), runs(0)
+      {}
+      // data decoding machinery
+      ui8* data;    //!< the address of data (or bitstream)
+      ui64 tmp;     //!< temporary buffer for read data
+      int bits;     //!< number of bits stored in tmp
+      int size;     //!< number of bytes in MEL code
+      bool unstuff; //!< true if the next bit needs to be unstuffed
+      int k;        //!< state of MEL decoder
+
+      // queue of decoded runs
+      int num_runs; //!< number of decoded runs left in runs (maximum 8)
+      ui64 runs;    //!< decoded runs of MEL codewords (7 bits/run)
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs the MEL bitstream
+     *
+     *  Bytes are read 32 bits at a time and unstuffed: a byte that follows
+     *  a 0xFF byte contributes only 7 bits, so the 32 bits read here can
+     *  yield as few as 28 usable bits.  When the MEL segment is exhausted,
+     *  0xFF bytes are fed in.
+     *
+     *  @param [in] melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_read(dec_mel_st *melp)
+    {
+      if (melp->bits > 32)  //there are enough bits in the tmp variable
+        return;             // return without reading new data
+
+      ui32 val = 0xFFFFFFFF;      // feed in 0xFF if buffer is exhausted
+      if (melp->size > 4) {       // if there is data in the MEL segment
+        val = *(ui32*)melp->data; // read 32 bits from MEL data
+        melp->data += 4;          // advance pointer
+        melp->size -= 4;          // reduce counter
+      }
+      else if (melp->size > 0)
+      { // 4 or less
+        int i = 0;
+        while (melp->size > 1) {
+          ui32 v = *melp->data++;    // read one byte at a time
+          ui32 m = ~(0xFFu << i);    // mask of location
+          val = (val & m) | (v << i);// put one byte in its correct location
+          --melp->size;
+          i += 8;
+        }
+        // size equal to 1
+        ui32 v = *melp->data++; // the one before the last is different
+        v |= 0xF;               // MEL and VLC segments can overlap
+        ui32 m = ~(0xFFu << i);
+        val = (val & m) | (v << i);
+        --melp->size;
+      }
+
+      // next we unstuff them before adding them to the buffer
+      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
+                                     // the previously read byte requires
+                                     // unstuffing
+
+      // data is unstuffed and accumulated in t
+      // bits has the number of bits in t
+      ui32 t = val & 0xFF;
+      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
+      bits -= unstuff; // there is one less bit in t if unstuffing is needed
+      t = t << (8 - unstuff); // move up to make room for the next byte
+
+      //this is a repeat of the above
+      t |= (val >> 8) & 0xFF;
+      unstuff = (((val >> 8) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val >> 16) & 0xFF;
+      unstuff = (((val >> 16) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val >> 24) & 0xFF;
+      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+
+      // move t to tmp, and push the result all the way up, so we read from
+      // the MSB
+      melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
+      melp->bits += bits; //increment the number of bits in tmp
+    }
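+
+    // Worked example (illustrative, not in the original): if the four
+    // bytes read by mel_read are 0x12 0xFF 0x7C 0x45, the 0xFF byte
+    // triggers unstuffing of the byte that follows it; the encoder
+    // guarantees the byte after a 0xFF has a zero MSB, so only 7 of its
+    // bits carry information, and these 32 raw bits contribute 31 bits
+    // to melp->tmp.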
+
+    //************************************************************************/
+    /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs
+     *
+     *  Runs are stored in "runs" and the number of runs in "num_runs".
+     *  Each run represents a number of zero events that may or may not
+     *  terminate in a 1 event.
+     *  Each run is stored in 7 bits.  The LSB is 1 if the run terminates in
+     *  a 1 event, 0 otherwise.  The next 6 bits, for the case terminating
+     *  with 1, contain the number of consecutive zero events * 2; for the
+     *  case terminating with 0, they store (number of consecutive zero
+     *  events - 1) * 2.
+     *  A total of 6 bits (made up of 1 + 5) should have been enough.
+     *
+     *  @param [in] melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_decode(dec_mel_st *melp)
+    {
+      static const int mel_exp[13] = { //MEL exponents
+        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
+      };
+
+      if (melp->bits < 6) // if there are fewer than 6 bits in tmp
+        mel_read(melp);   // then read from the MEL bitstream
+                          // 6 bits is the largest decodable MEL cwd
+
+      //repeat as long as there are enough decodable bits in tmp,
+      // and the runs store is not full (num_runs < 8)
+      while (melp->bits >= 6 && melp->num_runs < 8)
+      {
+        int eval = mel_exp[melp->k]; // number of bits associated with state
+        int run = 0;
+        if (melp->tmp & (1ull << 63)) //The next bit to decode (stored in MSB)
+        { //one is found
+          run = 1 << eval;
+          run--; // consecutive runs of 0 events - 1
+          melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
+          melp->tmp <<= 1; // consume one bit from tmp
+          melp->bits -= 1;
+          run = run << 1; // a stretch of zeros not terminating in one
+        }
+        else
+        { //0 is found
+          run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
+          melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
+          melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
+          melp->bits -= eval + 1;
+          run = (run << 1) + 1; // a stretch of zeros terminating with one
+        }
+        eval = melp->num_runs * 7;           // 7 bits per run
+        melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
+        melp->runs |= ((ui64)run) << eval;   // store the value in runs
+        melp->num_runs++;                    // increment count
+      }
+    }
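+
+    // Worked example (illustrative, not in the original): with state
+    // k = 6, mel_exp[6] = 2, so a single '1' bit decodes to 2^2 = 4 zero
+    // events with no terminating one event (stored as run = (4-1)*2 = 6),
+    // while a '0' bit followed by a 2-bit suffix s decodes to s zero
+    // events terminated by a one event (stored as run = 2*s + 1).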
+
+    //************************************************************************/
+    /** @brief Initiates a dec_mel_st structure for MEL decoding and reads
+     *         some bytes in order to get the read address to a multiple
+     *         of 4
+     *
+     *  @param [in] melp is a pointer to dec_mel_st structure
+     *  @param [in] bbuf is a pointer to byte buffer
+     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in] scup is the length of MEL+VLC segments
+     */
+    static inline
+    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
+    {
+      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
+      melp->bits = 0;                  // 0 bits in tmp
+      melp->tmp = 0;                   //
+      melp->unstuff = false;           // no unstuffing
+      melp->size = scup - 1;           // size is the length of MEL+VLC-1
+      melp->k = 0;                     // 0 for state
+      melp->num_runs = 0;              // num_runs is 0
+      melp->runs = 0;                  //
+
+      //This code is borrowed; original is for a different architecture
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1, 2, 3 up to 4 bytes from the MEL segment
+      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
+      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
+        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
+        ui64 d = (melp->size > 0) ? *melp->data : 0xFF; //if buffer is consumed
+                                                        //set data to 0xFF
+        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
+                                       // see the standard
+        melp->data += melp->size-- > 0; //increment if the end is not reached
+        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
+        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
+        melp->bits += d_bits;           //increment tmp by number of bits
+        melp->unstuff = ((d & 0xFF) == 0xFF); //true if the next byte needs
+                                              //unstuffing
+      }
+      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
+                                       // is the MSB
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves one run from dec_mel_st; if there are no runs
+     *         stored, the MEL segment is decoded
+     *
+     *  @param [in] melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    int mel_get_run(dec_mel_st *melp)
+    {
+      if (melp->num_runs == 0) //if no runs, decode more bits from MEL segment
+        mel_decode(melp);
+
+      int t = melp->runs & 0x7F; //retrieve one run
+      melp->runs >>= 7;          // remove the retrieved run
+      melp->num_runs--;
+      return t;                  // return run
+    }
+
+    //************************************************************************/
+    /** @brief A structure for reading and unstuffing a segment that grows
+     *         backward, such as VLC and MRP
+     */
+    struct rev_struct {
+      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
+      {}
+      //storage
+      ui8* data;    //!< pointer to where to read data
+      ui64 tmp;     //!< temporary buffer of read data
+      ui32 bits;    //!< number of bits stored in tmp
+      int size;     //!< number of bytes left
+      bool unstuff; //!< true if the last byte is more than 0x8F
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs data from a backwardly-growing segment
+     *
+     *  Reading proceeds 4 bytes at a time, from higher to lower addresses.
+     *  Unstuffing drops the MSB of a byte whose lower 7 bits are all ones
+     *  whenever the byte that follows it (at the higher address, read
+     *  earlier) is larger than 0x8F.  The unstuffed bits accumulate in tmp.
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read(rev_struct *vlcp)
+    {
+      //process 4 bytes at a time
+      if (vlcp->bits > 32) // if there are more than 32 bits in tmp, then
+        return;            // reading 32 bits can overflow vlcp->tmp
+      ui32 val = 0;
+      //the next line (the if statement) needs to be tested first
+      if (vlcp->size > 3) // if there are more than 3 bytes left in VLC
+      {
+        // (vlcp->data - 3) move pointer back to read 32 bits at once
+        val = *(ui32*)(vlcp->data - 3); // then read 32 bits
+        vlcp->data -= 4;                // move data pointer back by 4
+        vlcp->size -= 4;                // reduce available bytes by 4
+      }
+      else if (vlcp->size > 0)
+      { // 4 or less
+        int i = 24;
+        while (vlcp->size > 0) {
+          ui32 v = *vlcp->data--; // read one byte at a time
+          val |= (v << i);        // put byte in its correct location
+          --vlcp->size;
+          i -= 8;
+        }
+      }
+
+      __m128i tmp_vec = _mm_set1_epi32((int32_t)val);
+      tmp_vec = _mm_srlv_epi32(tmp_vec, _mm_setr_epi32(24, 16, 8, 0));
+      tmp_vec = _mm_and_si128(tmp_vec, _mm_set1_epi32(0xff));
+
+      __m128i unstuff_vec = _mm_cmpgt_epi32(tmp_vec, _mm_set1_epi32(0x8F));
+      bool unstuff_next = _mm_extract_epi32(unstuff_vec, 3);
+      unstuff_vec = _mm_slli_si128(unstuff_vec, 4);
+      unstuff_vec = _mm_insert_epi32(unstuff_vec, vlcp->unstuff * 0xffffffff, 0);
+
+      __m128i val_7f = _mm_set1_epi32(0x7F);
+      __m128i this_byte_7f = _mm_cmpeq_epi32(_mm_and_si128(tmp_vec, val_7f), val_7f);
+      unstuff_vec = _mm_and_si128(unstuff_vec, this_byte_7f);
+      unstuff_vec = _mm_srli_epi32(unstuff_vec, 31);
+
+      __m128i inc_sum = _mm_sub_epi32(_mm_set1_epi32(8), unstuff_vec);
+      inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
+      inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
+      ui32 total_bits = (ui32)_mm_extract_epi32(inc_sum, 3);
+
+      __m128i final_shift = _mm_slli_si128(inc_sum, 4);
+      tmp_vec = _mm_sllv_epi32(tmp_vec, final_shift);
+      tmp_vec = _mm_or_si128(tmp_vec, _mm_bsrli_si128(tmp_vec, 8));
+
+      ui64 tmp =
(ui32)_mm_cvtsi128_si32(tmp_vec) | (ui32)_mm_extract_epi32(tmp_vec, 1); + + vlcp->unstuff = unstuff_next; + vlcp->tmp |= tmp << vlcp->bits; + vlcp->bits += total_bits; + } + + //************************************************************************/ + /** @brief Initiates the rev_struct structure and reads a few bytes to + * move the read address to multiple of 4 + * + * There is another similar rev_init_mrp subroutine. The difference is + * that this one, rev_init, discards the first 12 bits (they have the + * sum of the lengths of VLC and MEL segments), and first unstuff depends + * on first 4 bits. + * + * @param [in] vlcp is a pointer to rev_struct structure + * @param [in] data is a pointer to byte at the start of the cleanup pass + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup) + { + //first byte has only the upper 4 bits + vlcp->data = data + lcup - 2; + + //size can not be larger than this, in fact it should be smaller + vlcp->size = scup - 2; + + ui32 d = *vlcp->data--; // read one byte (this is a half byte) + vlcp->tmp = d >> 4; // both initialize and set + vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard + vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream. + // To read 32 bits, read from (vlcp->data - 3) + int num = 1 + (int)(intptr_t(vlcp->data) & 0x3); + int tnum = num < vlcp->size ? num : vlcp->size; + for (int i = 0; i < tnum; ++i) { + ui64 d; + d = *vlcp->data--; // read one byte and move read pointer + //check if the last byte was >0x8F (unstuff == true) and this is 0x7F + ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 
1 : 0);
+        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
+        vlcp->bits += d_bits;
+        vlcp->unstuff = d > 0x8F; // for next byte
+      }
+      vlcp->size -= tnum;
+      rev_read(vlcp); // read another 32 bits
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, vlcp->tmp must have no less than 33 bits
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch(rev_struct *vlcp)
+    {
+      if (vlcp->bits < 32) // if there are fewer than 32 bits, read more
+      {
+        rev_read(vlcp);      // read 32 bits, but unstuffing might reduce this
+        if (vlcp->bits < 32) // if there is still space in vlcp->tmp for 32 bits
+          rev_read(vlcp);    // read another 32
+      }
+      return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in] vlcp is a pointer to rev_struct structure
+     *  @param [in] num_bits is the number of bits to be removed
+     */
+    static inline
+    ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
+    {
+      assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
+      vlcp->tmp >>= num_bits;         // remove bits
+      vlcp->bits -= num_bits;         // decrement the number of bits
+      return (ui32)vlcp->tmp;
+    }
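+
+    // An illustrative usage sketch (hypothetical, not used by this file):
+    // VLC decoding follows the same peek-then-consume protocol as the
+    // forward reader.  With a VLC table such as vlc_tbl0 and a context
+    // label c_q, one quad is decoded roughly as
+    //
+    //   ui32 head = rev_fetch(&vlc);                // peek >= 32 bits
+    //   ui16 entry = vlc_tbl0[c_q + (head & 0x7F)]; // 7-bit VLC lookup
+    //   rev_advance(&vlc, entry & 0x7);             // consume cwd length
+    //
+    // where the low 3 bits of a table entry hold the codeword length.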
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs from rev_struct
+     *
+     *  This differs from rev_read in that it fills in zeros when the
+     *  available data is consumed; the other does not care about the
+     *  values once all data is consumed.
+     *
+     *  See rev_read for more information about unstuffing
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read_mrp(rev_struct *mrp)
+    {
+      //process 4 bytes at a time
+      if (mrp->bits > 32)
+        return;
+      ui32 val = 0;
+      if (mrp->size > 3) // If there are 3 bytes or more
+      { // (mrp->data - 3) move pointer back to read 32 bits at once
+        val = *(ui32*)(mrp->data - 3); // read 32 bits
+        mrp->data -= 4;                // move back pointer
+        mrp->size -= 4;                // reduce count
+      }
+      else if (mrp->size > 0)
+      {
+        int i = 24;
+        while (mrp->size > 0) {
+          ui32 v = *mrp->data--; // read one byte at a time
+          val |= (v << i);       // put byte in its correct location
+          --mrp->size;
+          i -= 8;
+        }
+      }
+
+      //accumulate in tmp, and keep count in bits
+      ui32 bits, tmp = val >> 24;
+
+      //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
+      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
+      bool unstuff = (val >> 24) > 0x8F;
+
+      //process the next byte
+      tmp |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 16) & 0xFF) > 0x8F;
+
+      tmp |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 8) & 0xFF) > 0x8F;
+
+      tmp |= (val & 0xFF) << bits;
+      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = (val & 0xFF) > 0x8F;
+
+      mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
+      mrp->bits += bits;
+      mrp->unstuff = unstuff; // next byte
+    }
+
+    //************************************************************************/
+    /** @brief Initializes the rev_struct structure for the MRP segment,
+     *         and reads a number of bytes such that the next 32 bits read
+     *         are from an address that is a multiple of 4.  Note that this
+     *         is designed for an architecture where the read size must be
+     *         compatible with the alignment of the read address
+     *
+     *  There is another similar subroutine rev_init.  This subroutine does
+     *  NOT skip the first 12 bits, and starts with unstuff set to true.
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     *  @param [in] data is a pointer to byte at the start of the cleanup pass
+     *  @param [in] lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in] len2 is the length of SPP+MRP segments
+     */
+    static inline
+    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
+    {
+      mrp->data = data + lcup + len2 - 1;
+      mrp->size = len2;
+      mrp->unstuff = true;
+      mrp->bits = 0;
+      mrp->tmp = 0;
+
+      //This code is designed for an architecture where the read address
+      // should align to the read size (address multiple of 4 if read size is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1, 2, 3 up to 4 bytes from the MRP stream
+      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
+      for (int i = 0; i < num; ++i) {
+        ui64 d;
+        //read a byte, 0 if no more data
+        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
+        //check if unstuffing is needed
+        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
+        mrp->tmp |= d << mrp->bits; // move data to mrp->tmp
+        mrp->bits += d_bits;
+        mrp->unstuff = d > 0x8F; // for next byte
+      }
+      rev_read_mrp(mrp);
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, mrp->tmp must have no less than 33 bits
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch_mrp(rev_struct *mrp)
+    {
+      if (mrp->bits < 32) // if there are fewer than 32 bits in mrp->tmp
+      {
+        rev_read_mrp(mrp);   // read 30-32 bits from mrp
+        if (mrp->bits < 32)  // if there is still space for 32 bits
+          rev_read_mrp(mrp); // read more
+      }
+      return (ui32)mrp->tmp; // return the head of mrp->tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in] mrp is a pointer to rev_struct structure
+     *  @param [in] num_bits is the number of bits to be removed
+     */
+    inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
+    {
+      assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
+      mrp->tmp >>= num_bits;         // discard the lowest num_bits bits
+      mrp->bits -= num_bits;
+      return (ui32)mrp->tmp;         // return data after consumption
+    }
+
+    //************************************************************************/
+    /** @brief State structure for reading and unstuffing of forward-growing
+     *         bitstreams; these are: MagSgn and SPP bitstreams
+     */
+    struct frwd_struct {
+      const ui8* data;  //!< pointer to bitstream
+      ui8 tmp[48];      //!< temporary buffer of read data, with extra room
+      ui32 bits;        //!< number of bits stored in tmp
+      ui32 unstuff;     //!< 1 if a bit needs to be unstuffed from next byte
+      int size;         //!< size of data
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs 16 bytes from a forward-growing bitstream
+     *
+     *  A byte that follows a 0xFF byte loses its MSB (unstuffing).  When
+     *  the bitstream is exhausted, the value X (0 or 0xFF) is fed in.
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted
+     *  @param [in] msp is a pointer to frwd_struct structure
+     */
+    template<int X>
+    static inline
+    void frwd_read(frwd_struct *msp)
+    {
+      assert(msp->bits <= 128);
+
+      __m128i offset, val, validity, all_xff;
+      val = _mm_loadu_si128((__m128i*)msp->data);
+      int bytes = msp->size >= 16 ?
16 : msp->size; + validity = _mm_set1_epi8((char)bytes); + msp->data += bytes; + msp->size -= bytes; + int bits = 128; + offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908,0x0706050403020100); + validity = _mm_cmpgt_epi8(validity, offset); + all_xff = _mm_set1_epi8(-1); + if (X == 0xFF) // the compiler should remove this if statement + { + __m128i t = _mm_xor_si128(validity, all_xff); // complement + val = _mm_or_si128(t, val); // fill with 0xFF + } + else if (X == 0) + val = _mm_and_si128(validity, val); // fill with zeros + else + assert(0); + + __m128i ff_bytes; + ff_bytes = _mm_cmpeq_epi8(val, all_xff); + ff_bytes = _mm_and_si128(ff_bytes, validity); + ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes); + flags <<= 1; // unstuff following byte + ui32 next_unstuff = flags >> 16; + flags |= msp->unstuff; + flags &= 0xFFFF; + while (flags) + { // bit unstuffing occurs on average once every 256 bytes + // therefore it is not an issue if it is a bit slow + // here we process 16 bytes + --bits; // consuming one stuffing bit + + ui32 loc = 31 - count_leading_zeros(flags); + flags ^= 1 << loc; + + __m128i m, t, c; + t = _mm_set1_epi8((char)loc); + m = _mm_cmpgt_epi8(offset, t); + + t = _mm_and_si128(m, val); // keep bits at locations larger than loc + c = _mm_srli_epi64(t, 1); // 1 bits left + t = _mm_srli_si128(t, 8); // 8 bytes left + t = _mm_slli_epi64(t, 63); // keep the MSB only + t = _mm_or_si128(t, c); // combine the above 3 steps + + val = _mm_or_si128(t, _mm_andnot_si128(m, val)); + } + + // combine with earlier data + assert(msp->bits >= 0 && msp->bits <= 128); + int cur_bytes = msp->bits >> 3; + int cur_bits = msp->bits & 7; + __m128i b1, b2; + b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits)); + b2 = _mm_slli_si128(val, 8); // 8 bytes right + b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64-cur_bits)); + b1 = _mm_or_si128(b1, b2); + b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes)); + b2 = _mm_or_si128(b1, b2); + _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2); + + int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits; + cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3; // round up + int upper = _mm_extract_epi16(val, 7); + upper >>= consumed_bits - 128 + 16; + msp->tmp[cur_bytes] = (ui8)upper; // copy byte + + msp->bits += (ui32)bits; + msp->unstuff = next_unstuff; // next unstuff + assert(msp->unstuff == 0 || msp->unstuff == 1); + } + + //************************************************************************/ + /** @brief Initialize frwd_struct struct and reads some bytes + * + * @tparam X is the value fed in when the bitstream is exhausted. 
+     *  See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     *  @param [in] data is a pointer to the start of data
+     *  @param [in] size is the number of bytes in the bitstream
+     */
+    template<int X>
+    static inline
+    void frwd_init(frwd_struct *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128());
+      _mm_storeu_si128((__m128i *)msp->tmp + 1, _mm_setzero_si128());
+      _mm_storeu_si128((__m128i *)msp->tmp + 2, _mm_setzero_si128());
+
+      msp->bits = 0;
+      msp->unstuff = 0;
+      msp->size = size;
+
+      frwd_read<X>(msp); // read 128 bits more
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits bits from the bitstream of frwd_struct
+     *
+     *  @param [in] msp is a pointer to frwd_struct
+     *  @param [in] num_bits is the number of bits to consume
+     */
+    static inline
+    void frwd_advance(frwd_struct *msp, ui32 num_bits)
+    {
+      assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
+      msp->bits -= num_bits;
+
+      __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18));
+      num_bits &= 63;
+
+      __m128i v0, v1, c0, c1, t;
+      v0 = _mm_loadu_si128(p);
+      v1 = _mm_loadu_si128(p + 1);
+
+      // shift right by num_bits
+      c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits));
+      t = _mm_srli_si128(v0, 8);
+      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
+      c0 = _mm_or_si128(c0, t);
+      t = _mm_slli_si128(v1, 8);
+      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
+      c0 = _mm_or_si128(c0, t);
+
+      _mm_storeu_si128((__m128i*)msp->tmp, c0);
+
+      c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits));
+      t = _mm_srli_si128(v1, 8);
+      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
+      c1 = _mm_or_si128(c1, t);
+
+      _mm_storeu_si128((__m128i*)msp->tmp + 1, c1);
+    }
+
+    //************************************************************************/
+    /** @brief Fetches 128 bits from the frwd_struct bitstream
+     *
+     *  @tparam X is the value fed in when the bitstream is exhausted.
+     *   See frwd_read regarding the template
+     *  @param [in] msp is a pointer to frwd_struct
+     */
+    template<int X>
+    static inline
+    __m128i frwd_fetch(frwd_struct *msp)
+    {
+      if (msp->bits <= 128)
+      {
+        frwd_read<X>(msp);
+        if (msp->bits <= 128) //need to test
+          frwd_read<X>(msp);
+      }
+      __m128i t = _mm_loadu_si128((__m128i*)msp->tmp);
+      return t;
+    }
+
+    //************************************************************************/
+    /** @brief decodes two consecutive quads (one octet), using 32 bit data
+     *
+     *  @param inf_u_q decoded VLC code, with interleaved u values
+     *  @param U_q U values
+     *  @param magsgn structure for forward data buffer
+     *  @param p bitplane at which we are decoding
+     *  @param vn used for handling E values (stores v_n values)
+     *  @return __m256i decoded two quads
+     */
+    static inline __m256i decode_two_quad32_avx2(__m256i inf_u_q,
+      __m256i U_q, frwd_struct* magsgn, ui32 p, __m128i& vn)
+    {
+      __m256i row = _mm256_setzero_si256();
+
+      // we keep e_k, e_1, and rho in flags
+      __m256i flags = _mm256_and_si256(inf_u_q,
+        _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110,
+                         0x8880, 0x4440, 0x2220, 0x1110));
+      __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());
+
+      if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant?
+ { + flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8)); + + // U_q holds U_q for this quad + // flags has e_k, e_1, and rho such that e_k is sitting in the + // 0x8000, e_1 in 0x800, and rho in 0x80 + + // next e_k and m_n + __m256i m_n; + __m256i w0 = _mm256_srli_epi32(flags, 15); // e_k + m_n = _mm256_sub_epi32(U_q, w0); + m_n = _mm256_andnot_si256(insig, m_n); + + // find cumulative sums + // to find at which bit in ms_vec the sample starts + __m256i inc_sum = m_n; // inclusive scan + inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4)); + inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8)); + int total_mn1 = _mm256_extract_epi16(inc_sum, 6); + int total_mn2 = _mm256_extract_epi16(inc_sum, 14); + + __m128i ms_vec0 = _mm_setzero_si128(); + __m128i ms_vec1 = _mm_setzero_si128(); + if (total_mn1) { + ms_vec0 = frwd_fetch<0xFF>(magsgn); + frwd_advance(magsgn, (ui32)total_mn1); + } + if (total_mn2) { + ms_vec1 = frwd_fetch<0xFF>(magsgn); + frwd_advance(magsgn, (ui32)total_mn2); + } + + __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1); + + __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4); // exclusive scan + + // find the starting byte and starting bit + __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3); + __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7)); + byte_idx = _mm256_shuffle_epi8(byte_idx, + _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000)); + byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100)); + __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx); + byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101)); + __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx); + + // shift samples values to correct location + bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16)); + + __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1); + __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1); + + __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx); + bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101)); + d0 = _mm256_mullo_epi16(d0, bit_shift); + d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB + d1 = _mm256_mullo_epi16(d1, bit_shift); + d1 = _mm256_and_si256(d1, _mm256_set1_epi32((si32)0xFF00FF00)); // 8 in MSB + d0 = _mm256_or_si256(d0, d1); + + // find location of e_k and mask + __m256i shift; + __m256i ones = _mm256_set1_epi32(1); + __m256i twos = _mm256_set1_epi32(2); + __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones); + U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F)); + U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0); + w0 = _mm256_sub_epi32(twos, w0); + shift = _mm256_sllv_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31 + ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones)); + + // next e_1 + w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800)); + w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256()); + w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position + ms_vec = _mm256_or_si256(ms_vec, w0); // e_1 + w0 = _mm256_slli_epi32(ms_vec, 31); // sign + ms_vec = _mm256_or_si256(ms_vec, ones); // bin center + __m256i tvn = ms_vec; + ms_vec = _mm256_add_epi32(ms_vec, twos);// + 2 + ms_vec = _mm256_slli_epi32(ms_vec, (si32)p - 1); + ms_vec = _mm256_or_si256(ms_vec, w0); // sign + row = _mm256_andnot_si256(insig, ms_vec); // significant 
// only
+
+        ms_vec = _mm256_andnot_si256(insig, tvn); // significant only
+
+        tvn = _mm256_shuffle_epi8(ms_vec,
+          _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1,
+                           -1, -1, 0x0F0E0D0C, 0x07060504));
+
+        vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
+        vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
+      }
+      return row;
+    }
+
+
+    //************************************************************************/
+    /** @brief decodes four consecutive quads (two octets), using 16 bit data
+     *
+     *  @param inf_u_q decoded VLC code, with interleaved u values
+     *  @param U_q U values
+     *  @param magsgn structure for forward data buffer
+     *  @param p bitplane at which we are decoding
+     *  @param vn used for handling E values (stores v_n values)
+     *  @return __m256i decoded four quads
+     */
+
+    static inline __m256i decode_four_quad16(const __m128i inf_u_q,
+      __m128i U_q, frwd_struct* magsgn, ui32 p, __m128i& vn)
+    {
+      __m256i w0;    // workers
+      __m256i insig; // lanes hold FF's if samples are insignificant
+      __m256i flags; // lanes hold e_k, e_1, and rho
+
+      __m256i row = _mm256_setzero_si256();
+      __m128i ddd = _mm_shuffle_epi8(inf_u_q,
+        _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x0908,
+                      0x0504, 0x0504, 0x0100, 0x0100));
+      w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
+        _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
+      // we keep e_k, e_1, and rho in flags
+      flags = _mm256_and_si256(w0,
+        _mm256_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
+                         (si16)0x8880, 0x4440, 0x2220, 0x1110,
+                         (si16)0x8880, 0x4440, 0x2220, 0x1110,
+                         (si16)0x8880, 0x4440, 0x2220, 0x1110));
+      insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
+      if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant?
+      {
+        ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
+        __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
+          _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
+        flags = _mm256_mullo_epi16(flags,
+          _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));
+
+        // U_q holds U_q for this quad
+        // flags has e_k, e_1, and rho such that e_k is sitting in the
+        // 0x8000, e_1 in 0x800, and rho in 0x80
+
+        // next e_k and m_n
+        __m256i m_n;
+        w0 = _mm256_srli_epi16(flags, 15); // e_k
+        m_n = _mm256_sub_epi16(U_q_avx, w0);
+        m_n = _mm256_andnot_si256(insig, m_n);
+
+        // find cumulative sums
+        // to find at which bit in ms_vec the sample starts
+        __m256i inc_sum = m_n; // inclusive scan
+        inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
+        inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
+        inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
+        int total_mn1 = _mm256_extract_epi16(inc_sum, 7);
+        int total_mn2 = _mm256_extract_epi16(inc_sum, 15);
+        __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2); // exclusive scan
+
+        __m128i ms_vec0 = _mm_setzero_si128();
+        __m128i ms_vec1 = _mm_setzero_si128();
+        if (total_mn1) {
+          ms_vec0 = frwd_fetch<0xFF>(magsgn);
+          frwd_advance(magsgn, (ui32)total_mn1);
+        }
+        if (total_mn2) {
+          ms_vec1 = frwd_fetch<0xFF>(magsgn);
+          frwd_advance(magsgn, (ui32)total_mn2);
+        }
+
+        __m256i ms_vec =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
+
+        // find the starting byte and starting bit
+        __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
+        __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
+        byte_idx = _mm256_shuffle_epi8(byte_idx,
+          _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
+                           0x0606, 0x0404,
0x0202, 0x0000)); + byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100)); + __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx); + byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101)); + __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx); + + // shift samples values to correct location + __m256i bit_shift = _mm256_shuffle_epi8( + _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, + 1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1, + 1, 3, 7, 15, 31, 63, 127, -1), bit_idx); + bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101)); + d0 = _mm256_mullo_epi16(d0, bit_shift); + d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB + d1 = _mm256_mullo_epi16(d1, bit_shift); + d1 = _mm256_and_si256(d1, _mm256_set1_epi16((si16)0xFF00)); // 8 in MSB + d0 = _mm256_or_si256(d0, d1); + + // find location of e_k and mask + __m256i shift, t0, t1, Uq0, Uq1; + __m256i ones = _mm256_set1_epi16(1); + __m256i twos = _mm256_set1_epi16(2); + __m256i U_q_m1 = _mm256_sub_epi32(U_q_avx, ones); + Uq0 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F)); + Uq1 = _mm256_bsrli_epi128(U_q_m1, 14); + w0 = _mm256_sub_epi16(twos, w0); + t0 = _mm256_and_si256(w0, _mm256_set_epi64x(0, -1, 0, -1)); + t1 = _mm256_and_si256(w0, _mm256_set_epi64x(-1, 0, -1, 0)); + {//no _mm256_sllv_epi16 in avx2 + __m128i t_0_sse = _mm256_castsi256_si128(t0); + t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq0)); + __m128i t_1_sse = _mm256_extracti128_si256(t0 , 0x1); + t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq0, 0x1)); + t0 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1); + + t_0_sse = _mm256_castsi256_si128(t1); + t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq1)); + t_1_sse = _mm256_extracti128_si256(t1, 0x1); + t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq1, 0x1)); + t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1); + } + shift = _mm256_or_si256(t0, t1); + ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones)); + + // next e_1 + w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800)); + w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256()); + w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position + ms_vec = _mm256_or_si256(ms_vec, w0); // e_1 + w0 = _mm256_slli_epi16(ms_vec, 15); // sign + ms_vec = _mm256_or_si256(ms_vec, ones); // bin center + __m256i tvn = ms_vec; + ms_vec = _mm256_add_epi16(ms_vec, twos);// + 2 + ms_vec = _mm256_slli_epi16(ms_vec, (si32)p - 1); + ms_vec = _mm256_or_si256(ms_vec, w0); // sign + row = _mm256_andnot_si256(insig, ms_vec); // significant only + + ms_vec = _mm256_andnot_si256(insig, tvn); // significant only + + __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec, + _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1, + -1, -1, -1, -1, -1, -1, 0x0706, 0x0302)); + __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec, + _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1, + -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1)); + ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2); + + vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec)); + vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1)); + } + return row; + } + + // https://stackoverflow.com/a/58827596 + inline __m256i avx2_lzcnt_epi32(__m256i v) { + // prevent value from being rounded up to the next power of two + v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep 8 MSB + + v = _mm256_castps_si256(_mm256_cvtepi32_ps(v)); // convert an integer 
to float
+      v = _mm256_srli_epi32(v, 23); // shift down the exponent
+      v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias
+      v = _mm256_min_epi16(v, _mm256_set1_epi32(32)); // clamp at 32
+
+      return v;
+    }
+
+    //************************************************************************/
+    /** @brief Decodes one codeblock, processing the cleanup, significance
+      *        propagation, and magnitude refinement passes
+      *
+      *  @param [in]  coded_data is a pointer to bitstream
+      *  @param [in]  decoded_data is a pointer to decoded codeblock data buffer
+      *  @param [in]  missing_msbs is the number of missing MSBs
+      *  @param [in]  num_passes is the number of passes: 1 if CUP only,
+      *               2 for CUP+SPP, and 3 for CUP+SPP+MRP
+      *  @param [in]  lengths1 is the length of cleanup pass
+      *  @param [in]  lengths2 is the length of refinement passes (either SPP
+      *               only or SPP+MRP)
+      *  @param [in]  width is the decoded codeblock width
+      *  @param [in]  height is the decoded codeblock height
+      *  @param [in]  stride is the decoded codeblock buffer stride
+      *  @param [in]  stripe_causal is true for stripe causal mode
+      */
+    bool ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data,
+                                    ui32 missing_msbs, ui32 num_passes,
+                                    ui32 lengths1, ui32 lengths2,
+                                    ui32 width, ui32 height, ui32 stride,
+                                    bool stripe_causal)
+    {
+      static bool insufficient_precision = false;
+      static bool modify_code = false;
+      static bool truncate_spp_mrp = false;
+
+      if (num_passes > 1 && lengths2 == 0)
+      {
+        OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
+                              "one coding pass, but zero length for "
+                              "2nd and potential 3rd pass.");
+        num_passes = 1;
+      }
+
+      if (num_passes > 3)
+      {
+        OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
+                              "this codeblock has %d passes.",
+                              num_passes);
+        return false;
+      }
+
+      if (missing_msbs > 30) // p < 0
+      {
+        if (insufficient_precision == false)
+        {
+          insufficient_precision = true;
+          OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
+                                "codeblock. This message will not be "
+                                "displayed again.");
+        }
+        return false;
+      }
+      else if (missing_msbs == 30) // p == 0
+      { // not enough precision to decode and set the bin center to 1
+        if (modify_code == false) {
+          modify_code = true;
+          OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
+                                "pass. The code can be modified to support "
+                                "this case. This message will not be "
+                                "displayed again.");
+        }
+        return false; // 32 bits are not enough to decode this
+      }
+      else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
+      {
+        if (num_passes > 1) {
+          num_passes = 1;
+          if (truncate_spp_mrp == false) {
+            truncate_spp_mrp = true;
+            OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
+                                  "nor MagRef passes; both will be skipped. "
+                                  "This message will not be displayed "
+                                  "again.");
+          }
+        }
+      }
+      ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
+      // There is a way to handle the case of p == 0, but a different path
+      // is required
+
+      if (lengths1 < 2)
+      {
+        OJPH_WARN(0x00010006, "Wrong codeblock length.");
+        return false;
+      }
+
+      // read scup and fix the bytes there
+      int lcup, scup;
+      lcup = (int)lengths1; // length of CUP
+      //scup is the length of MEL + VLC
+      scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
+      if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
+        return false;
+
+      // The temporary storage scratch holds two types of data in an
+      // interleaved fashion. The interleaving allows us to use one
+      // memory pointer.
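As an aside, the float-exponent trick in avx2_lzcnt_epi32 above is easier to check in scalar form. The following is a minimal sketch of the same computation for a single 32-bit value; the function name and the memcpy-based bit inspection are illustrative, not part of the patch:

```cpp
#include <cstdint>
#include <cstring>

// Count leading zeros by reading the exponent of a float conversion.
// 158 = 127 (exponent bias) + 31 (bit index of the MSB).
static inline uint32_t lzcnt_via_float(uint32_t v)
{
  v &= ~(v >> 8);             // clear bits that could make the float
                              // conversion round up to the next power of 2
  float f = (float)v;
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint32_t n = 158u - (bits >> 23);   // 158 minus the biased exponent
  return n > 32u ? 32u : n;           // clamp; v == 0 yields 32
}
```

In the vector version, the saturating _mm256_subs_epu16 takes care of the lower end of this clamp: if a lane has its top bit set, the sign bit leaks into the shifted exponent field, the subtraction saturates at zero, and the count correctly comes out as 0.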
+      // We have one entry for a decoded VLC code, and one entry for UVLC.
+      // Entries are 16 bits each, corresponding to one quad,
+      // but since we want to use XMM registers of the SSE family
+      // of SIMD, we allocate 16 bytes or more per quad row; that is,
+      // the width is no smaller than 16 bytes (or 8 entries), and the
+      // height is 512 quads.
+      // Each VLC entry contains, in the following order, starting
+      // from MSB
+      // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
+      // Each entry in UVLC contains u_q
+      // One extra row to handle the case of SPP propagating downwards
+      // when codeblock width is 4
+      ui16 scratch[8 * 513] = {0}; // 8+ kB
+
+      // We need an extra two entries (one inf and one u_q) beyond
+      // the last column.
+      // If the block width is 4 (2 quads), then we use sstr of 8
+      // (enough for 4 quads). If width is 8 (4 quads), we use sstr
+      // of 16 (enough for 8 quads). For a width of 16 (8
+      // quads), we use 24 (enough for 12 quads).
+      ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
+
+      assert((stride & 0x3) == 0);
+
+      ui32 mmsbp2 = missing_msbs + 2;
+
+      // The cleanup pass is decoded in two steps; in step one,
+      // the VLC and MEL segments are decoded, generating a record that
+      // has 2 bytes per quad. The 2 bytes contain u, rho, e^1 & e^k.
+      // This information should be sufficient for the next step.
+      // In step 2, we decode the MagSgn segment.
+
+      // step 1 decoding VLC and MEL segments
+      {
+        // init structures
+        dec_mel_st mel;
+        mel_init(&mel, coded_data, lcup, scup);
+        rev_struct vlc;
+        rev_init(&vlc, coded_data, lcup, scup);
+
+        int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
+                                     // data represented as runs of 0 events
+                                     // See mel_decode description
+
+        ui32 vlc_val;
+        ui32 c_q = 0;
+        ui16 *sp = scratch;
+        //initial quad row
+        for (ui32 x = 0; x < width; sp += 4)
+        {
+          // decode VLC
+          /////////////
+
+          // first quad
+          vlc_val = rev_fetch(&vlc);
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0) //zero context
+          {
+            run -= 2; //subtract 2, since events number is multiplied by 2
+
+            // Is the run terminated in 1? if so, use decoded VLC code,
+            // otherwise, discard decoded data, since we will decode again
+            // using a different context
+            t0 = (run == -1) ? t0 : 0;
+
+            // is run -1 or -2? this means a run has been consumed
+            if (run < 0)
+              run = mel_get_run(&mel); // get another run
+          }
+          //run -= (c_q == 0) ? 2 : 0;
+          //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+          sp[0] = t0;
+          x += 2;
+
+          // prepare context for the next quad; eqn. 1 in ITU T.814
+          c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
+
+          //remove data from vlc stream (0 bits are removed if vlc is not used)
+          vlc_val = rev_advance(&vlc, t0 & 0x7);
+
+          //second quad
+          ui16 t1 = 0;
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0 && x < width) //zero context
+          {
+            run -= 2; //subtract 2, since events number is multiplied by 2
+
+            // if event is 0, discard decoded t1
+            t1 = (run == -1) ? t1 : 0;
+
+            if (run < 0) // have we consumed all events in a run
+              run = mel_get_run(&mel); // if yes, then get another run
+          }
+          t1 = x < width ? t1 : 0;
+          //run -= (c_q == 0 && x < width) ? 2 : 0;
+          //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+          sp[2] = t1;
+          x += 2;
+
+          //prepare context for the next quad, eqn. 1 in ITU T.814
+          c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
+
+          //remove data from vlc stream, if qinf is not used, cwdlen is 0
+          vlc_val = rev_advance(&vlc, t1 & 0x7);
+
+          // decode u
+          /////////////
+          // uvlc_mode is made up of u_offset bits from the quad pair
+          ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+          if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
+          {                     // the MEL run of events
+            run -= 2; //subtract 2, since events number is multiplied by 2
+
+            uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
+                                                 // 0x40 if event is 1
+
+            if (run < 0)//if run is consumed (run is -1 or -2), get another run
+              run = mel_get_run(&mel);
+          }
+          //run -= (uvlc_mode == 0xc0) ? 2 : 0;
+          //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+
+          //decode uvlc_mode to get u for both quads
+          ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
+          //remove total prefix length
+          vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
+          uvlc_entry >>= 3;
+          //extract suffixes for quad 0 and 1
+          ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
+          ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
+          vlc_val = rev_advance(&vlc, len);
+          ojph_unused(vlc_val); //static code analysis: unused value
+          uvlc_entry >>= 4;
+          // quad 0 length
+          len = uvlc_entry & 0x7; // quad 0 suffix length
+          uvlc_entry >>= 3;
+          ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kappa == 1
+          sp[1] = u_q;
+          u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
+          sp[3] = u_q;
+        }
+        sp[0] = sp[1] = 0;
+
+        //non initial quad rows
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          c_q = 0; // context
+          ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
+
+          for (ui32 x = 0; x < width; sp += 4)
+          {
+            // decode VLC
+            /////////////
+
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
+
+            // first quad
+            vlc_val = rev_fetch(&vlc);
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) //zero context
+            {
+              run -= 2; //subtract 2, since events number is multiplied by 2
+
+              // Is the run terminated in 1? if so, use decoded VLC code,
+              // otherwise, discard decoded data, since we will decode again
+              // using a different context
+              t0 = (run == -1) ? t0 : 0;
+
+              // is run -1 or -2? this means a run has been consumed
+              if (run < 0)
+                run = mel_get_run(&mel); // get another run
+            }
+            //run -= (c_q == 0) ? 2 : 0;
+            //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel); // get another run
+            sp[0] = t0;
+            x += 2;
+
+            // prepare context for the next quad; eqn.
2 in ITU T.814 + // sigma_q (w, sw) + c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1); + // sigma_q (nw) + c_q |= sp[0 - (si32)sstr] & 0x80; + // sigma_q (n, ne, nf) + c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2); + c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4); + + //remove data from vlc stream (0 bits are removed if vlc is unused) + vlc_val = rev_advance(&vlc, t0 & 0x7); + + //second quad + ui16 t1 = 0; + + //decode VLC using the context c_q and the head of VLC bitstream + t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)]; + + // if context is zero, use one MEL event + if (c_q == 0 && x < width) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // if event is 0, discard decoded t1 + t1 = (run == -1) ? t1 : 0; + + if (run < 0) // have we consumed all events in a run + run = mel_get_run(&mel); // if yes, then get another run + } + t1 = x < width ? t1 : 0; + //run -= (c_q == 0 && x < width) ? 2 : 0; + //t1 = (c_q != 0 || run == -1) ? t1 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[2] = t1; + x += 2; + + // partial c_q, will be completed when we process the next quad + // sigma_q (w, sw) + c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1); + // sigma_q (nw) + c_q |= sp[2 - (si32)sstr] & 0x80; + + //remove data from vlc stream, if qinf is not used, cwdlen is 0 + vlc_val = rev_advance(&vlc, t1 & 0x7); + + // decode u + ///////////// + // uvlc_mode is made up of u_offset bits from the quad pair + ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4); + ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)]; + //remove total prefix length + vlc_val = rev_advance(&vlc, uvlc_entry & 0x7); + uvlc_entry >>= 3; + //extract suffixes for quad 0 and 1 + ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads + ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads + vlc_val = rev_advance(&vlc, len); + ojph_unused(vlc_val); //static code analysis: unused value + uvlc_entry >>= 4; + // quad 0 length + len = uvlc_entry & 0x7; // quad 0 suffix length + uvlc_entry >>= 3; + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); + sp[1] = u_q; + u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q + sp[3] = u_q; + } + sp[0] = sp[1] = 0; + } + } + + // step2 we decode magsgn + // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit) + // The 32 bit path decode 16 bits data, for which one would think + // 16 bits are enough, because we want to put in the center of the + // bin. + // If you have mmsbp2 equals 16 bit, and reversible coding, and + // no bitplanes are missing, then we can decoding using the 16 bit + // path, but we are not doing this here. + if (mmsbp2 >= 16) + { + // We allocate a scratch row for storing v_n values. + // We have 512 quads horizontally. + // We may go beyond the last entry by up to 4 entries. + // Here we allocate additional 8 entries. + // There are two rows in this structure, the bottom + // row is used to store processed entries. 
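Before the 32-bit path below, it may help to restate what step 1 left behind: each u_q was assembled from a uvlc_tbl0/uvlc_tbl1 entry in three stages (total prefix length, shared suffix bits, per-quad fields). This scalar sketch mirrors the shifts used in the initial-row loop (kappa == 1); the struct, the function name, and the parameter packaging are illustrative only:

```cpp
#include <cstdint>

struct u_pair { uint32_t u_q0, u_q1; };

// entry: a table entry whose 3-bit total-prefix-length field has already
// been consumed from the VLC stream; suffix_bits: the next VLC stream bits.
// Field layout implied by the shifts above (LSB first):
//   [2:0] total prefix length, [6:3] total suffix length for the pair,
//   [9:7] quad-0 suffix length, [12:10] quad-0 u prefix, [15:13] quad-1.
static u_pair unpack_uvlc(uint32_t entry, uint32_t suffix_bits)
{
  entry >>= 3;                         // total prefix length, already used
  uint32_t len = entry & 0xF;          // suffix bits shared by both quads
  uint32_t tmp = suffix_bits & ((1u << len) - 1);
  entry >>= 4;
  len = entry & 0x7;                   // quad 0 takes the low bits of tmp
  entry >>= 3;
  u_pair r;
  r.u_q0 = 1 + (entry & 7) + (tmp & ~(0xFFu << len)); // kappa == 1
  r.u_q1 = 1 + (entry >> 3) + (tmp >> len);           // kappa == 1
  return r;
}
```

The non-initial rows use the same unpacking, only without the fixed kappa of 1 added here, as in the loop above.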
+ const int v_n_size = 512 + 16; + ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB + + frwd_struct magsgn; + frwd_init<0xFF>(&magsgn, coded_data, lcup - scup); + + const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2); + + { + ui16 *sp = scratch; + ui32 *vp = v_n_scratch; + ui32 *dp = decoded_data; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) + { + __m128i vn = _mm_set1_epi32(2); + + __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp)); + inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + + __m256i U_q = _mm256_srli_epi32(inf_u_q, 16); + __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2); + if (!_mm256_testz_si256(w, w)) { + return false; + } + + __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn); + row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row)); + _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1)); + + __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp); + w0 = _mm_or_si128(w0, vn); + _mm_storeu_si128((__m128i*)vp, w0); + } + } + + for (ui32 y = 2; y < height; y += 2) + { + { + // perform 31 - count_leading_zeros(*vp) here + ui32 *vp = v_n_scratch; + ui16* sp = scratch + (y >> 1) * sstr; + + const __m256i avx_31 = _mm256_set1_epi32(31); + const __m256i avx_f0 = _mm256_set1_epi32(0xF0); + const __m256i avx_1 = _mm256_set1_epi32(1); + const __m256i avx_0 = _mm256_setzero_si256(); + + for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) { + __m256i v = _mm256_loadu_si256((__m256i*)vp); + __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1)); + v = _mm256_or_si256(v, v_p1); + v = avx2_lzcnt_epi32(v); + v = _mm256_sub_epi32(avx_31, v); + + __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp); + __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0); + __m256i w0 = _mm256_sub_epi32(gamma, avx_1); + gamma = _mm256_and_si256(gamma, w0); + gamma = _mm256_cmpeq_epi32(gamma, avx_0); + + v = _mm256_andnot_si256(gamma, v); + v = _mm256_max_epi32(v, avx_1); + + inf_u_q = _mm256_srli_epi32(inf_u_q, 16); + v = _mm256_add_epi32(inf_u_q, v); + + w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2); + if (!_mm256_testz_si256(w0, w0)) { + return false; + } + + _mm256_storeu_si256((__m256i*)(vp + v_n_size), v); + } + } + + ui32 *vp = v_n_scratch; + ui16 *sp = scratch + (y >> 1) * sstr; + ui32 *dp = decoded_data + y * stride; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) { + //process two quads + __m128i vn = _mm_set1_epi32(2); + + __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp)); + inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + + __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size))); + U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + + __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn); + row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row)); + _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1)); + + __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp); + w0 = _mm_or_si128(w0, vn); + _mm_storeu_si128((__m128i*)vp, w0); + } + } + } + else { + + // reduce bitplane by 16 because we now have 16 bits instead of 32 + p -= 16; + + // We allocate a scratch 
row for storing v_n values. + // We have 512 quads horizontally. + // We may go beyond the last entry by up to 8 entries. + // Therefore we allocate additional 8 entries. + // There are two rows in this structure, the bottom + // row is used to store processed entries. + const int v_n_size = 512 + 16; + ui16 v_n_scratch[v_n_size] = {0}; // 1+ kB + ui32 v_n_scratch_32[v_n_size] = {0}; // 2+ kB + + frwd_struct magsgn; + frwd_init<0xFF>(&magsgn, coded_data, lcup - scup); + + { + ui16 *sp = scratch; + ui16 *vp = v_n_scratch; + ui32 *dp = decoded_data; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) { + ////process four quads + __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp); + __m128i U_q = _mm_srli_epi32(inf_u_q, 16); + __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2)); + if (!_mm_testz_si128(w, w)) { + return false; + } + + __m128i vn = _mm_set1_epi16(2); + __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn); + + w = _mm_cvtsi32_si128(*(unsigned short const*)(vp)); + _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn)); + + __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1)); + __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1)); + + _mm256_storeu_si256((__m256i*)dp, w0); + _mm256_storeu_si256((__m256i*)(dp + stride), w1); + } + } + + for (ui32 y = 2; y < height; y += 2) { + { + // perform 15 - count_leading_zeros(*vp) here + ui16 *vp = v_n_scratch; + ui32 *vp_32 = v_n_scratch_32; + + ui16* sp = scratch + (y >> 1) * sstr; + const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2); + const __m256i avx_31 = _mm256_set1_epi32(31); + const __m256i avx_f0 = _mm256_set1_epi32(0xF0); + const __m256i avx_1 = _mm256_set1_epi32(1); + const __m256i avx_0 = _mm256_setzero_si256(); + + for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) { + __m128i v = _mm_loadu_si128((__m128i*)vp); + __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1)); + v = _mm_or_si128(v, v_p1); + + __m256i v_avx = _mm256_cvtepu16_epi32(v); + v_avx = avx2_lzcnt_epi32(v_avx); + v_avx = _mm256_sub_epi32(avx_31, v_avx); + + __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp); + __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0); + __m256i w0 = _mm256_sub_epi32(gamma, avx_1); + gamma = _mm256_and_si256(gamma, w0); + gamma = _mm256_cmpeq_epi32(gamma, avx_0); + + v_avx = _mm256_andnot_si256(gamma, v_avx); + v_avx = _mm256_max_epi32(v_avx, avx_1); + + inf_u_q = _mm256_srli_epi32(inf_u_q, 16); + v_avx = _mm256_add_epi32(inf_u_q, v_avx); + + w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2); + if (!_mm256_testz_si256(w0, w0)) { + return false; + } + + _mm256_storeu_si256((__m256i*)vp_32, v_avx); + } + } + + ui16 *vp = v_n_scratch; + ui32* vp_32 = v_n_scratch_32; + ui16 *sp = scratch + (y >> 1) * sstr; + ui32 *dp = decoded_data + y * stride; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) { + ////process four quads + __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp); + __m128i U_q = _mm_loadu_si128((__m128i*)vp_32); + + __m128i vn = _mm_set1_epi16(2); + __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn); + + __m128i w = _mm_cvtsi32_si128(*(unsigned short const*)(vp)); + _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn)); + + __m256i w0 = _mm256_shuffle_epi8(row, 
_mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
+          __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
+
+          _mm256_storeu_si256((__m256i*)dp, w0);
+          _mm256_storeu_si256((__m256i*)(dp + stride), w1);
+        }
+      }
+
+      // increase bitplane back by 16 because we need to process 32 bits
+      p += 16;
+    }
+
+    if (num_passes > 1)
+    {
+      // We use scratch again; we can divide it into multiple regions
+      // sigma holds all the significant samples, and it cannot
+      // be modified after it is set. It will be used during the
+      // Magnitude Refinement Pass
+      ui16* const sigma = scratch;
+
+      ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
+                                       // ui16 contains 4 columns
+      mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
+
+      // We re-arrange quad significance, where each 4 consecutive
+      // bits represent one quad, into column significance, where
+      // each 4 consecutive bits represent one column of 4 rows
+      {
+        ui32 y;
+
+        const __m128i mask_3 = _mm_set1_epi32(0x30);
+        const __m128i mask_C = _mm_set1_epi32(0xC0);
+        const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
+        for (y = 0; y < height; y += 4)
+        {
+          ui16* sp = scratch + (y >> 1) * sstr;
+          ui16* dp = sigma + (y >> 2) * mstr;
+          for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
+          {
+            __m128i s0, s1, u3, uC, t0, t1;
+
+            s0 = _mm_loadu_si128((__m128i*)(sp));
+            u3 = _mm_and_si128(s0, mask_3);
+            u3 = _mm_srli_epi32(u3, 4);
+            uC = _mm_and_si128(s0, mask_C);
+            uC = _mm_srli_epi32(uC, 2);
+            t0 = _mm_or_si128(u3, uC);
+
+            s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
+            u3 = _mm_and_si128(s1, mask_3);
+            u3 = _mm_srli_epi32(u3, 2);
+            uC = _mm_and_si128(s1, mask_C);
+            t1 = _mm_or_si128(u3, uC);
+
+            __m128i r = _mm_or_si128(t0, t1);
+            r = _mm_shuffle_epi8(r, shuffle_mask);
+
+            // _mm_storeu_si32 is not defined, so we use this workaround
+            _mm_store_ss((float*)dp, _mm_castsi128_ps(r));
+          }
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+        {
+          // reset one row after the codeblock
+          ui16* dp = sigma + (y >> 2) * mstr;
+          __m128i zero = _mm_setzero_si128();
+          for (ui32 x = 0; x < width; x += 32, dp += 8)
+            _mm_store_si128((__m128i*)dp, zero);
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+      }
+
+      // We perform Significance Propagation Pass here
+      {
+        // This stores significance information of the previous
+        // 4 rows. Significance information in this array includes
+        // all significant samples in bitplane p - 1; that is,
+        // significant samples for bitplane p (discovered during the
+        // cleanup pass and stored in sigma) and samples that have recently
+        // become significant (during the SPP) in bitplane p-1.
+        // We store enough for the widest row, containing 1024 columns,
+        // which is equivalent to 256 of ui16, since each stores 4 columns.
+        // We add an extra 8 entries, just in case we need more
+        ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
+
+        frwd_struct sigprop;
+        frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
+
+        for (ui32 y = 0; y < height; y += 4)
+        {
+          ui32 pattern = 0xFFFFu; // a pattern of needed samples
+          if (height - y < 4) {
+            pattern = 0x7777u;
+            if (height - y < 3) {
+              pattern = 0x3333u;
+              if (height - y < 2)
+                pattern = 0x1111u;
+            }
+          }
+
+          // prev holds sign. info. for the previous quad, together
+          // with the rows on top of it and below it.
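The loop below leans on one bit trick throughout: significance for a 4-row stripe is nibble-packed, one 4-row column per nibble, so a sample's neighborhood can be grown with shifts and masks instead of per-sample tests. A scalar sketch of the candidate ("mbr") computation, with illustrative names and with the partial-stripe pattern mask left out:

```cpp
#include <cstdint>

// cs: current significance, 8 columns x 4 rows, one column per nibble,
// nibble LSB = top row; above/below: border rows, already shifted into
// this stripe's bit positions; prev: significance of the previous group,
// masked to 0xF000 as in the loop below.
static uint32_t spp_candidates(uint32_t cs, uint32_t above, uint32_t below,
                               uint32_t prev)
{
  uint32_t mbr = cs;
  mbr |= (cs & 0x77777777u) << 1;  // neighbor above is significant
  mbr |= (cs & 0xEEEEEEEEu) >> 1;  // neighbor below is significant
  mbr |= above | below;            // rows bordering the stripe
  uint32_t t = mbr;
  mbr |= t << 4;                   // left neighbor column
  mbr |= t >> 4;                   // right neighbor column
  mbr |= prev >> 12;               // last column of the previous group
  return mbr & ~cs;                // drop already-significant samples
}
```

Positions that survive this mask are the only ones for which SPP bits are read from the bitstream, which is what lets the pass consume frwd_fetch data sparsely.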
+ ui32 prev = 0; + ui16 *prev_sig = prev_row_sig; + ui16 *cur_sig = sigma + (y >> 2) * mstr; + ui32 *dpp = decoded_data + y * stride; + for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig) + { + // only rows and columns inside the stripe are included + si32 s = (si32)x + 4 - (si32)width; + s = ojph_max(s, 0); + pattern = pattern >> (s * 4); + + // We first find locations that need to be tested (potential + // SPP members); these location will end up in mbr + // In each iteration, we produce 16 bits because cwd can have + // up to 16 bits of significance information, followed by the + // corresponding 16 bits of sign information; therefore, it is + // sufficient to fetch 32 bit data per loop. + + // Althougth we are interested in 16 bits only, we load 32 bits. + // For the 16 bits we are producing, we need the next 4 bits -- + // We need data for at least 5 columns out of 8. + // Therefore loading 32 bits is easier than loading 16 bits + // twice. + ui32 ps = *(ui32*)prev_sig; + ui32 ns = *(ui32*)(cur_sig + mstr); + ui32 u = (ps & 0x88888888) >> 3; // the row on top + if (!stripe_causal) + u |= (ns & 0x11111111) << 3; // the row below + + ui32 cs = *(ui32*)cur_sig; + // vertical integration + ui32 mbr = cs; // this sig. info. + mbr |= (cs & 0x77777777) << 1; //above neighbors + mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors + mbr |= u; + // horizontal integration + ui32 t = mbr; + mbr |= t << 4; // neighbors on the left + mbr |= t >> 4; // neighbors on the right + mbr |= prev >> 12; // significance of previous group + + // remove outside samples, and already significant samples + mbr &= pattern; + mbr &= ~cs; + + // find samples that become significant during the SPP + ui32 new_sig = mbr; + if (new_sig) + { + __m128i cwd_vec = frwd_fetch<0>(&sigprop); + ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0); + + ui32 cnt = 0; + ui32 col_mask = 0xFu; + ui32 inv_sig = ~cs & pattern; + for (int i = 0; i < 16; i += 4, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan one column + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x33u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x76u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xECu << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xC8u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + } + + if (new_sig) + { + cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt); + + // Spread new_sig, such that each bit is in one byte with a + // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1 + __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig); + new_sig_vec = _mm_shuffle_epi8(new_sig_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + new_sig_vec = _mm_and_si128(new_sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + new_sig_vec = _mm_cmpeq_epi8(new_sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + + // find cumulative sums + // to find which bit in cwd we should extract + __m128i inc_sum = new_sig_vec; // inclusive scan + inc_sum = _mm_abs_epi8(inc_sum); // cvrt to 0 or 1 + inc_sum = _mm_add_epi8(inc_sum, 
_mm_bslli_si128(inc_sum, 1)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8)); + cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8; + // exclusive scan + __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); + + // Spread cwd, such that each bit is in one byte + // with a value of 0 or 1. + cwd_vec = _mm_set1_epi16((si16)cwd); + cwd_vec = _mm_shuffle_epi8(cwd_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + cwd_vec = _mm_and_si128(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_cmpeq_epi8(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_abs_epi8(cwd_vec); + + // Obtain bit from cwd_vec correspondig to ex_sum + // Basically, collect needed bits from cwd_vec + __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum); + + // load data and set spp coefficients + __m128i m = + _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0); + __m128i val = _mm_set1_epi32(3 << (p - 2)); + ui32 *dp = dpp; + for (int c = 0; c < 4; ++ c) { + __m128i s0, s0_ns, s0_val; + // load coefficients + s0 = _mm_load_si128((__m128i*)dp); + + // epi32 is -1 only for coefficient that + // are changed during the SPP + s0_ns = _mm_shuffle_epi8(new_sig_vec, m); + s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF)); + + // obtain sign for coefficients in SPP + s0_val = _mm_shuffle_epi8(v, m); + s0_val = _mm_slli_epi32(s0_val, 31); + s0_val = _mm_or_si128(s0_val, val); + s0_val = _mm_and_si128(s0_val, s0_ns); + + // update vector + s0 = _mm_or_si128(s0, s0_val); + // store coefficients + _mm_store_si128((__m128i*)dp, s0); + // prepare for next row + dp += stride; + m = _mm_add_epi32(m, _mm_set1_epi32(1)); + } + } + frwd_advance(&sigprop, cnt); + } + + new_sig |= cs; + *prev_sig = (ui16)(new_sig); + + // vertical integration for the new sig. info. + t = new_sig; + new_sig |= (t & 0x7777) << 1; //above neighbors + new_sig |= (t & 0xEEEE) >> 1; //below neighbors + // add sig. info. 
from the row on top and below + prev = new_sig | u; + // we need only the bits in 0xF000 + prev &= 0xF000; + } + } + } + + // We perform Magnitude Refinement Pass here + if (num_passes > 2) + { + rev_struct magref; + rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2); + + for (ui32 y = 0; y < height; y += 4) + { + ui16 *cur_sig = sigma + (y >> 2) * mstr; + ui32 *dpp = decoded_data + y * stride; + for (ui32 i = 0; i < width; i += 4, dpp += 4) + { + //Process one entry from sigma array at a time + // Each nibble (4 bits) in the sigma array represents 4 rows, + ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data + ui16 sig = *cur_sig++; // 16 bit that will be processed now + int total_bits = 0; + if (sig) // if any of the 32 bits are set + { + // We work on 4 rows, with 4 samples each, since + // data is 32 bit (4 bytes) + + // spread the 16 bits in sig to 0 or 1 bytes in sig_vec + __m128i sig_vec = _mm_set1_epi16((si16)sig); + sig_vec = _mm_shuffle_epi8(sig_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + sig_vec = _mm_and_si128(sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + sig_vec = _mm_cmpeq_epi8(sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + sig_vec = _mm_abs_epi8(sig_vec); + + // find cumulative sums + // to find which bit in cwd we should extract + __m128i inc_sum = sig_vec; // inclusive scan + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8)); + total_bits = _mm_extract_epi16(inc_sum, 7) >> 8; + __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan + + // Spread the 16 bits in cwd to inverted 0 or 1 bytes in + // cwd_vec. 
Then, convert these to a form suitable + // for coefficient modifications; in particular, a value + // of 0 is presented as binary 11, and a value of 1 is + // represented as binary 01 + __m128i cwd_vec = _mm_set1_epi16((si16)cwd); + cwd_vec = _mm_shuffle_epi8(cwd_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + cwd_vec = _mm_and_si128(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_cmpeq_epi8(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1)); + cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec); + cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1)); + + // load data and insert the mrp bit + __m128i m = + _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0); + ui32 *dp = dpp; + for (int c = 0; c < 4; ++c) { + __m128i s0, s0_sig, s0_idx, s0_val; + // load coefficients + s0 = _mm_load_si128((__m128i*)dp); + // find significant samples in this row + s0_sig = _mm_shuffle_epi8(sig_vec, m); + s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128()); + // get MRP bit index, and MRP pattern + s0_idx = _mm_shuffle_epi8(ex_sum, m); + s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx); + // keep data from significant samples only + s0_val = _mm_andnot_si128(s0_sig, s0_val); + // move mrp bits to correct position, and employ + s0_val = _mm_slli_epi32(s0_val, (si32)p - 2); + s0 = _mm_xor_si128(s0, s0_val); + // store coefficients + _mm_store_si128((__m128i*)dp, s0); + // prepare for next row + dp += stride; + m = _mm_add_epi32(m, _mm_set1_epi32(1)); + } + } + // consume data according to the number of bits set + rev_advance_mrp(&magref, (ui32)total_bits); + } + } + } + } + + return true; + } + } +} diff --git a/src/core/coding/ojph_block_decoder_ssse3.cpp b/src/core/coding/ojph_block_decoder_ssse3.cpp index 99ae38c..9fa5800 100644 --- a/src/core/coding/ojph_block_decoder_ssse3.cpp +++ b/src/core/coding/ojph_block_decoder_ssse3.cpp @@ -1033,14 +1033,14 @@ namespace ojph { { OJPH_WARN(0x00010001, "A malformed codeblock that has more than " "one coding pass, but zero length for " - "2nd and potential 3rd pass"); + "2nd and potential 3rd pass."); num_passes = 1; } if (num_passes > 3) { OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " - "This codeblocks has %d passes", + "This codeblocks has %d passes.", num_passes); return false; } @@ -1052,7 +1052,7 @@ namespace ojph { insufficient_precision = true; OJPH_WARN(0x00010003, "32 bits are not enough to decode this " "codeblock. This message will not be " - "displayed again"); + "displayed again."); } return false; } @@ -1063,7 +1063,7 @@ namespace ojph { OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " "pass. The code can be modified to support " "this case. This message will not be " - "displayed again"); + "displayed again."); } return false; // 32 bits are not enough to decode this } @@ -1076,7 +1076,7 @@ namespace ojph { OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " "nor MagRef passes; both will be skipped. 
" "This message will not be displayed " - "again"); + "again."); } } } @@ -1086,7 +1086,7 @@ namespace ojph { if (lengths1 < 2) { - OJPH_WARN(0x00010006, "Wrong codeblock length"); + OJPH_WARN(0x00010006, "Wrong codeblock length."); return false; } @@ -1361,7 +1361,7 @@ namespace ojph { // quad 0 length len = uvlc_entry & 0x7; // quad 0 suffix length uvlc_entry >>= 3; - ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); sp[1] = u_q; u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q sp[3] = u_q; diff --git a/src/core/coding/ojph_block_encoder.cpp b/src/core/coding/ojph_block_encoder.cpp index 2023ef1..ffc9e8d 100644 --- a/src/core/coding/ojph_block_encoder.cpp +++ b/src/core/coding/ojph_block_encoder.cpp @@ -65,11 +65,12 @@ namespace ojph { static ui16 vlc_tbl1[2048] = { 0 }; //UVLC encoding - static int ulvc_cwd_pre[33]; - static int ulvc_cwd_pre_len[33]; - static int ulvc_cwd_suf[33]; - static int ulvc_cwd_suf_len[33]; - + const int num_uvlc_entries = 75; + struct uvlc_tbl_struct { + ui8 pre, pre_len, suf, suf_len, ext, ext_len; + }; + static uvlc_tbl_struct uvlc_tbl[num_uvlc_entries]; + ///////////////////////////////////////////////////////////////////////// static bool vlc_init_tables() { @@ -194,23 +195,61 @@ namespace ojph { static bool uvlc_init_tables() { //code goes from 0 to 31, extension and 32 are not supported here - ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2; - ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4; - ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1; - ulvc_cwd_pre_len[2] = 2; - ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3; - ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0; - ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1; - ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0; - ulvc_cwd_suf_len[2] = 0; - ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1; + uvlc_tbl[0].pre = 0; + uvlc_tbl[0].pre_len = 0; + uvlc_tbl[0].suf = 0; + uvlc_tbl[0].suf_len = 0; + uvlc_tbl[0].ext = 0; + uvlc_tbl[0].ext_len = 0; + + uvlc_tbl[1].pre = 1; + uvlc_tbl[1].pre_len = 1; + uvlc_tbl[1].suf = 0; + uvlc_tbl[1].suf_len = 0; + uvlc_tbl[1].ext = 0; + uvlc_tbl[1].ext_len = 0; + + uvlc_tbl[2].pre = 2; + uvlc_tbl[2].pre_len = 2; + uvlc_tbl[2].suf = 0; + uvlc_tbl[2].suf_len = 0; + uvlc_tbl[2].ext = 0; + uvlc_tbl[2].ext_len = 0; + + uvlc_tbl[3].pre = 4; + uvlc_tbl[3].pre_len = 3; + uvlc_tbl[3].suf = 0; + uvlc_tbl[3].suf_len = 1; + uvlc_tbl[3].ext = 0; + uvlc_tbl[3].ext_len = 0; + + uvlc_tbl[4].pre = 4; + uvlc_tbl[4].pre_len = 3; + uvlc_tbl[4].suf = 1; + uvlc_tbl[4].suf_len = 1; + uvlc_tbl[4].ext = 0; + uvlc_tbl[4].ext_len = 0; + for (int i = 5; i < 33; ++i) { - ulvc_cwd_pre[i] = 0; - ulvc_cwd_pre_len[i] = 3; - ulvc_cwd_suf[i] = i-5; - ulvc_cwd_suf_len[i] = 5; + uvlc_tbl[i].pre = 0; + uvlc_tbl[i].pre_len = 3; + uvlc_tbl[i].suf = (ui8)(i - 5); + uvlc_tbl[i].suf_len = 5; + uvlc_tbl[i].ext = 0; + uvlc_tbl[i].ext_len = 0; } + + for (int i = 33; i < num_uvlc_entries; ++i) + { + uvlc_tbl[i].pre = 0; + uvlc_tbl[i].pre_len = 3; + uvlc_tbl[i].suf = (ui8)(28 + (i - 33) % 4); + uvlc_tbl[i].suf_len = 5; + uvlc_tbl[i].ext = (ui8)((i - 33) / 4); + uvlc_tbl[i].ext_len = 4; + } + return true; } @@ -440,6 +479,29 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_encode64(ms_struct* msp, ui64 cwd, int cwd_len) + { + while (cwd_len > 0) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is 
full"); + int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len); + msp->tmp |= (ui32)((cwd & ((1ULL << t) - 1)) << msp->used_bits); + msp->used_bits += t; + cwd >>= t; + cwd_len -= t; + if (msp->used_bits >= msp->max_bits) + { + msp->buf[msp->pos++] = (ui8)msp->tmp; + msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8; + msp->tmp = 0; + msp->used_bits = 0; + } + } + } + ////////////////////////////////////////////////////////////////////////// static inline void ms_terminate(ms_struct* msp) @@ -467,11 +529,11 @@ namespace ojph { // // ////////////////////////////////////////////////////////////////////////// - void ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded) + void ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) { assert(num_passes == 1); (void)num_passes; //currently not used @@ -693,23 +755,23 @@ namespace ojph { if (u_q0 > 2 && u_q1 > 2) { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0-2], ulvc_cwd_pre_len[u_q0-2]); - vlc_encode(&vlc, ulvc_cwd_pre[u_q1-2], ulvc_cwd_pre_len[u_q1-2]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0-2], ulvc_cwd_suf_len[u_q0-2]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q1-2], ulvc_cwd_suf_len[u_q1-2]); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len); } else if (u_q0 > 2 && u_q1 > 0) { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]); + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); vlc_encode(&vlc, u_q1 - 1, 1); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); } else { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]); - vlc_encode(&vlc, ulvc_cwd_pre[u_q1], ulvc_cwd_pre_len[u_q1]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q1], ulvc_cwd_suf_len[u_q1]); + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); } //prepare for next iteration @@ -910,10 +972,514 @@ namespace ojph { ms_encode(&ms, s[7] & ((1U<> 1) | ((rho[1] & 8) >> 2); + s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0; + e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0; + rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0; + } + } + + + terminate_mel_vlc(&mel, &vlc); + ms_terminate(&ms); + + //copy to elastic + lengths[0] = mel.pos + vlc.pos + ms.pos; + elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded); + memcpy(coded->buf, ms.buf, ms.pos); + memcpy(coded->buf + ms.pos, mel.buf, mel.pos); + memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos); + + // put in the interface locator word + ui32 num_bytes = mel.pos + vlc.pos; + coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4); + coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0; + coded->buf[lengths[0]-2] = + (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF)); + + coded->avail_size -= lengths[0]; + } + + 
////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + ////////////////////////////////////////////////////////////////////////// + void ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) + { + assert(num_passes == 1); + (void)num_passes; //currently not used + // 38 bits/sample + 1 color + 4 wavelet = 43 bits per sample. + // * 4096 samples / 8 bits per byte = 22016; then rounded up to the + // nearest 1 kB, givin 22528. This expanded further to take into + // consideration stuffing at a max rate of 16 bits per 15 bits + // (1 bit for every 15 bits of data); in reality, it is much smaller + // than this. + const int ms_size = (22528 * 16 + 14) / 15; //more than enough + ui8 ms_buf[ms_size]; + // For each quad, we need at most, 7 bits for VLC and 12 bits for UVLC. + // So we have 1024 quads * 19 / 8, which is 2432. This must be + // multiplied by 16 / 15 to accommodate stuffing. + // The mel is at most around 1 bit/quad, giving around 128 byte -- in + // practice there was on case where it got to 132 bytes. Even + // accounting for stuffing, it is smaller than 192. Therefore, + // 3072 is more than enough + const int mel_vlc_size = 3072; //more than enough + ui8 mel_vlc_buf[mel_vlc_size]; + const int mel_size = 192; + ui8 *mel_buf = mel_vlc_buf; + const int vlc_size = mel_vlc_size - mel_size; + ui8 *vlc_buf = mel_vlc_buf + mel_size; + + mel_struct mel; + mel_init(&mel, mel_size, mel_buf); + vlc_struct vlc; + vlc_init(&vlc, vlc_size, vlc_buf); + ms_struct ms; + ms_init(&ms, ms_size, ms_buf); + + ui32 p = 62 - missing_msbs; + + //e_val: E values for a line (these are the highest set bit) + //cx_val: is the context values + //Each byte stores the info for the 2 sample. For E, it is maximum + // of the two samples, while for cx, it is the OR of these two samples. + //The maximum is between the pixel at the bottom left of one quad + // and the bottom right of the earlier quad. The same is true for cx. + //For a 1024 pixels, we need 512 bytes, the 2 extra, + // one for the non-existing earlier quad, and one for beyond the + // the end + ui8 e_val[513]; + ui8 cx_val[513]; + ui8* lep = e_val; lep[0] = 0; + ui8* lcxp = cx_val; lcxp[0] = 0; + + //initial row of quads + int e_qmax[2] = {0,0}, e_q[8] = {0,0,0,0,0,0,0,0}; + int rho[2] = {0,0}; + int c_q0 = 0; + ui64 s[8] = {0,0,0,0,0,0,0,0}, val, t; + ui32 y = 0; + ui64 *sp = buf; + for (ui32 x = 0; x < width; x += 4) + { + //prepare two quads + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL; // 2 \mu_p + if (val) + { + rho[0] = 1; + e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = e_q[0]; + s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? 
sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 2; + e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[1]); + s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 1 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 4; + e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[2]); + s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 8; + e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[3]); + s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int Uq0 = ojph_max(e_qmax[0], 1); //kappa_q = 1 + int u_q0 = Uq0 - 1, u_q1 = 0; //kappa_q = 1 + + int eps0 = 0; + if (u_q0 > 0) + { + eps0 |= (e_q[0] == e_qmax[0]); + eps0 |= (e_q[1] == e_qmax[0]) << 1; + eps0 |= (e_q[2] == e_qmax[0]) << 2; + eps0 |= (e_q[3] == e_qmax[0]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + lep[0] = (ui8)e_q[3]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + lcxp[0] = (ui8)((rho[0] & 8) >> 3); + + ui16 tuple0 = vlc_tbl0[(c_q0 << 8) + (rho[0] << 4) + eps0]; + vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7); + + if (c_q0 == 0) + mel_encode(&mel, rho[0] != 0); + + int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0; + ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m); + m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0; + ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m); + m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0; + ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m); + m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0; + ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m); + + if (x + 2 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] = 1; + e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = e_q[4]; + s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 2; + e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[5]); + s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 3 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 4; + e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[6]); + s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? 
sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 8; + e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[7]); + s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int c_q1 = (rho[0] >> 1) | (rho[0] & 1); + int Uq1 = ojph_max(e_qmax[1], 1); //kappa_q = 1 + u_q1 = Uq1 - 1; //kappa_q = 1 + + int eps1 = 0; + if (u_q1 > 0) + { + eps1 |= (e_q[4] == e_qmax[1]); + eps1 |= (e_q[5] == e_qmax[1]) << 1; + eps1 |= (e_q[6] == e_qmax[1]) << 2; + eps1 |= (e_q[7] == e_qmax[1]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++; + lep[0] = (ui8)e_q[7]; + lcxp[0] |= (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++; + lcxp[0] = (ui8)((rho[1] & 8) >> 3); + ui16 tuple1 = vlc_tbl0[(c_q1 << 8) + (rho[1] << 4) + eps1]; + vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7); + + if (c_q1 == 0) + mel_encode(&mel, rho[1] != 0); + + int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0; + ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m); + m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0; + ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m); + m = (rho[1] & 4) ? Uq1 - ((tuple1 & 4) >> 2) : 0; + ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m); + m = (rho[1] & 8) ? Uq1 - ((tuple1 & 8) >> 3) : 0; + ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m); + } + + if (u_q0 > 0 && u_q1 > 0) + mel_encode(&mel, ojph_min(u_q0, u_q1) > 2); + + if (u_q0 > 2 && u_q1 > 2) + { + vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].ext, uvlc_tbl[u_q0-2].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].ext, uvlc_tbl[u_q1-2].ext_len); + } + else if (u_q0 > 2 && u_q1 > 0) + { + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, u_q1 - 1, 1); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + } + else + { + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len); + } + + //prepare for next iteration + c_q0 = (rho[1] >> 1) | (rho[1] & 1); + s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0; + e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0; + rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0; + } + + lep[1] = 0; + + for (y = 2; y < height; y += 2) + { + lep = e_val; + int max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = 0; + lcxp = cx_val; + c_q0 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = 0; + + sp = buf + y * stride; + for (ui32 x = 0; x < width; x += 4) + { + //prepare two quads + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] = 1; + e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = e_q[0]; + s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? 
sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 2; + e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[1]); + s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 1 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 4; + e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[2]); + s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 8; + e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[3]); + s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int kappa = (rho[0] & (rho[0]-1)) ? ojph_max(1,max_e) : 1; + int Uq0 = ojph_max(e_qmax[0], kappa); + int u_q0 = Uq0 - kappa, u_q1 = 0; + + int eps0 = 0; + if (u_q0 > 0) + { + eps0 |= (e_q[0] == e_qmax[0]); + eps0 |= (e_q[1] == e_qmax[0]) << 1; + eps0 |= (e_q[2] == e_qmax[0]) << 2; + eps0 |= (e_q[3] == e_qmax[0]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = (ui8)e_q[3]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + int c_q1 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = (ui8)((rho[0] & 8) >> 3); + ui16 tuple0 = vlc_tbl1[(c_q0 << 8) + (rho[0] << 4) + eps0]; + vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7); + + if (c_q0 == 0) + mel_encode(&mel, rho[0] != 0); + + int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0; + ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m); + m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0; + ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m); + m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0; + ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m); + m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0; + ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m); + + if (x + 2 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] = 1; + e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = e_q[4]; + s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 2; + e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[5]); + s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 3 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 4; + e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[6]); + s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 8; + e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[7]); + s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + kappa = (rho[1] & (rho[1]-1)) ? 
ojph_max(1,max_e) : 1; + c_q1 |= ((rho[0] & 4) >> 1) | ((rho[0] & 8) >> 2); + int Uq1 = ojph_max(e_qmax[1], kappa); + u_q1 = Uq1 - kappa; + + int eps1 = 0; + if (u_q1 > 0) + { + eps1 |= (e_q[4] == e_qmax[1]); + eps1 |= (e_q[5] == e_qmax[1]) << 1; + eps1 |= (e_q[6] == e_qmax[1]) << 2; + eps1 |= (e_q[7] == e_qmax[1]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++; + max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = (ui8)e_q[7]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++; + c_q0 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = (ui8)((rho[1] & 8) >> 3); + ui16 tuple1 = vlc_tbl1[(c_q1 << 8) + (rho[1] << 4) + eps1]; + vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7); + + if (c_q1 == 0) + mel_encode(&mel, rho[1] != 0); + + int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0; + ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m); + m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0; + ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m); + m = (rho[1] & 4) ? Uq1 - ((tuple1 & 4) >> 2) : 0; + ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m); + m = (rho[1] & 8) ? Uq1 - ((tuple1 & 8) >> 3) : 0; + ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m); + } + + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len); //prepare for next iteration c_q0 |= ((rho[1] & 4) >> 1) | ((rho[1] & 8) >> 2); diff --git a/src/core/coding/ojph_block_encoder.h b/src/core/coding/ojph_block_encoder.h index 0c4b926..72b3c0d 100644 --- a/src/core/coding/ojph_block_encoder.h +++ b/src/core/coding/ojph_block_encoder.h @@ -52,11 +52,25 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void - ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded); + ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded); + + void + ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded); + + void + ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs, + ui32 num_passes, ui32 width, ui32 height, + ui32 stride, ui32* lengths, + ojph::mem_elastic_allocator* elastic, + ojph::coded_lists*& coded); void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs, @@ -64,6 +78,9 @@ namespace ojph { ui32 stride, ui32* lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *& coded); + + bool initialize_block_encoder_tables_avx2(); + bool initialize_block_encoder_tables_avx512(); } } diff --git a/src/core/coding/ojph_block_encoder_avx2.cpp b/src/core/coding/ojph_block_encoder_avx2.cpp new file mode 100644 index 0000000..7624272 --- /dev/null +++ b/src/core/coding/ojph_block_encoder_avx2.cpp @@ -0,0 +1,1213 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. 
+// +// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2019, The University of New South Wales, Australia +// Copyright (c) 2024, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_block_encoder_avx2.cpp +//***************************************************************************/ + +#include <cassert> +#include <cstring> +#include <cstdint> +#include <climits> +#include <immintrin.h> + +#include "ojph_mem.h" +#include "ojph_arch.h" +#include "ojph_block_encoder.h" +#include "ojph_message.h" + +#ifdef OJPH_COMPILER_MSVC + #define likely(x) (x) + #define unlikely(x) (x) +#else + #define likely(x) __builtin_expect((x), 1) + #define unlikely(x) __builtin_expect((x), 0) +#endif + +namespace ojph { + namespace local { + + ///////////////////////////////////////////////////////////////////////// + // tables + ///////////////////////////////////////////////////////////////////////// + + //VLC encoding + // index is (c_q << 8) + (rho << 4) + eps + // data is (cwd << 8) + (cwd_len << 4) + eps + // table 0 is for the initial line of quads + static ui32 vlc_tbl0[2048]; + static ui32 vlc_tbl1[2048]; + + //UVLC encoding + static ui32 ulvc_cwd_pre[33]; + static int ulvc_cwd_pre_len[33]; + static ui32 ulvc_cwd_suf[33]; + static int ulvc_cwd_suf_len[33]; + + ///////////////////////////////////////////////////////////////////////// + static bool vlc_init_tables() + { + struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; }; + vlc_src_table tbl0[] = { + #include "table0.h" + }; + size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table); + + si32 pattern_popcnt[16]; + for (ui32 i = 0; i < 16; ++i) + pattern_popcnt[i] = (si32)population_count(i); + + vlc_src_table* src_tbl = tbl0; + ui32 *tgt_tbl = vlc_tbl0; + size_t tbl_size = tbl0_size; + for (int i = 0; i < 2048; ++i) + { + int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF; + if (((emb & rho) != emb) || (rho == 0 && c_q == 0)) + tgt_tbl[i] = 0; + else + { + vlc_src_table *best_entry = NULL; + if (emb) // u_off = 1 + { + int best_e_k = -1; + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q 
&& src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 1) + if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1) + { + //now we need to find the smallest cwd with the highest + // number of bits set in e_k + int ones_count = pattern_popcnt[src_tbl[j].e_k]; + if (ones_count >= best_e_k) + { + best_entry = src_tbl + j; + best_e_k = ones_count; + } + } + } + } + else // u_off = 0 + { + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 0) + { + best_entry = src_tbl + j; + break; + } + } + } + assert(best_entry); + tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4) + + best_entry->e_k); + } + } + + vlc_src_table tbl1[] = { + #include "table1.h" + }; + size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table); + + src_tbl = tbl1; + tgt_tbl = vlc_tbl1; + tbl_size = tbl1_size; + for (int i = 0; i < 2048; ++i) + { + int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF; + if (((emb & rho) != emb) || (rho == 0 && c_q == 0)) + tgt_tbl[i] = 0; + else + { + vlc_src_table *best_entry = NULL; + if (emb) // u_off = 1 + { + int best_e_k = -1; + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 1) + if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1) + { + //now we need to find the smallest cwd with the highest + // number of bits set in e_k + int ones_count = pattern_popcnt[src_tbl[j].e_k]; + if (ones_count >= best_e_k) + { + best_entry = src_tbl + j; + best_e_k = ones_count; + } + } + } + } + else // u_off = 0 + { + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 0) + { + best_entry = src_tbl + j; + break; + } + } + } + assert(best_entry); + tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4) + + best_entry->e_k); + } + } + + + return true; + } + + ///////////////////////////////////////////////////////////////////////// + static bool uvlc_init_tables() + { + //code goes from 0 to 31, extension and 32 are not supported here + ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2; + ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4; + ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1; + ulvc_cwd_pre_len[2] = 2; + ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3; + ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0; + ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1; + ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0; + ulvc_cwd_suf_len[2] = 0; + ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1; + for (int i = 5; i < 33; ++i) + { + ulvc_cwd_pre[i] = 0; + ulvc_cwd_pre_len[i] = 3; + ulvc_cwd_suf[i] = (ui32)(i-5); + ulvc_cwd_suf_len[i] = 5; + } + return true; + } + + ///////////////////////////////////////////////////////////////////////// + static bool tables_initialized = false; + + ///////////////////////////////////////////////////////////////////////// + bool initialize_block_encoder_tables_avx2() { + if (!tables_initialized) { + memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); + memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); + tables_initialized = vlc_init_tables(); + tables_initialized = tables_initialized && uvlc_init_tables(); + } + return tables_initialized; + } + + ///////////////////////////////////////////////////////////////////////// + // + ///////////////////////////////////////////////////////////////////////// + struct mel_struct { + //storage + ui8* buf; //pointer to data buffer + ui32 pos; //position of next writing within buf + ui32 buf_size; //size of buffer, which we 
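// --------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how an entry of the tables
// built above is consumed. Each entry packs (cwd << 8) + (cwd_len << 4) +
// e_k and is indexed by (c_q << 8) + (rho << 4) + eps; names hypothetical.
struct vlc_entry_sketch { ui32 cwd; int cwd_len; int e_k; };
static vlc_entry_sketch lookup_vlc_sketch(const ui32* tbl,
                                          int c_q, int rho, int eps)
{
  ui32 v = tbl[(c_q << 8) + (rho << 4) + eps];
  vlc_entry_sketch e;
  e.cwd = v >> 8;           // the VLC codeword itself
  e.cwd_len = (v >> 4) & 7; // its length in bits
  e.e_k = v & 0xF;          // per-sample "exponent equals e_qmax" flags
  return e;
}
// --------------------------------------------------------------------------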
must not exceed + + // all these can be replaced by bytes + int remaining_bits; //number of empty bits in tmp + int tmp; //temporary storage of coded bits + int run; //number of 0 run + int k; //state + int threshold; //threshold where one bit must be coded + }; + + ////////////////////////////////////////////////////////////////////////// + static inline void + mel_init(mel_struct* melp, ui32 buffer_size, ui8* data) + { + melp->buf = data; + melp->pos = 0; + melp->buf_size = buffer_size; + melp->remaining_bits = 8; + melp->tmp = 0; + melp->run = 0; + melp->k = 0; + melp->threshold = 1; // this is 1 << mel_exp[melp->k]; + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + mel_emit_bit(mel_struct* melp, int v) + { + melp->tmp = (melp->tmp << 1) + v; + melp->remaining_bits--; + if (melp->remaining_bits == 0) { + melp->buf[melp->pos++] = (ui8)melp->tmp; + melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8); + melp->tmp = 0; + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + mel_encode(mel_struct* melp, bool bit) + { + //MEL exponent + static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5}; + + if (bit == false) { + ++melp->run; + if (melp->run >= melp->threshold) { + mel_emit_bit(melp, 1); + melp->run = 0; + melp->k = ojph_min(12, melp->k + 1); + melp->threshold = 1 << mel_exp[melp->k]; + } + } else { + mel_emit_bit(melp, 0); + int t = mel_exp[melp->k]; + while (t > 0) { + mel_emit_bit(melp, (melp->run >> --t) & 1); + } + melp->run = 0; + melp->k = ojph_max(0, melp->k - 1); + melp->threshold = 1 << mel_exp[melp->k]; + } + } + + ///////////////////////////////////////////////////////////////////////// + // + ///////////////////////////////////////////////////////////////////////// + struct vlc_struct { + //storage + ui8* buf; //pointer to data buffer + ui32 pos; //position of next writing within buf + ui32 buf_size; //size of buffer, which we must not exceed + + int used_bits; //number of occupied bits in tmp + ui64 tmp; //temporary storage of coded bits + bool last_greater_than_8F; //true if last byte is greater than 0x8F + }; + + ////////////////////////////////////////////////////////////////////////// + static inline void + vlc_init(vlc_struct* vlcp, ui32 buffer_size, ui8* data) + { + vlcp->buf = data + buffer_size - 1; //points to last byte + vlcp->pos = 1; //locations will be all -pos + vlcp->buf_size = buffer_size; + + vlcp->buf[0] = 0xFF; + vlcp->used_bits = 4; + vlcp->tmp = 0xF; + vlcp->last_greater_than_8F = true; + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + vlc_encode(vlc_struct* vlcp, ui32 cwd, int cwd_len) + { + vlcp->tmp |= (ui64)cwd << vlcp->used_bits; + vlcp->used_bits += cwd_len; + + while (vlcp->used_bits >= 8) { + ui8 tmp; + + if (unlikely(vlcp->last_greater_than_8F)) { + tmp = vlcp->tmp & 0x7F; + + if (likely(tmp != 0x7F)) { + tmp = vlcp->tmp & 0xFF; + *(vlcp->buf - vlcp->pos) = tmp; + vlcp->last_greater_than_8F = tmp > 0x8F; + vlcp->tmp >>= 8; + vlcp->used_bits -= 8; + } else { + *(vlcp->buf - vlcp->pos) = tmp; + vlcp->last_greater_than_8F = false; + vlcp->tmp >>= 7; + vlcp->used_bits -= 7; + } + + } else { + tmp = vlcp->tmp & 0xFF; + *(vlcp->buf - vlcp->pos) = tmp; + vlcp->last_greater_than_8F = tmp > 0x8F; + vlcp->tmp >>= 8; + vlcp->used_bits -= 8; + } + + vlcp->pos++; + } + } + + ////////////////////////////////////////////////////////////////////////// + // + 
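// --------------------------------------------------------------------------
// Usage sketch for the MEL coder defined above (illustrative, not part of
// the patch): a completed run of threshold zeros costs one '1' bit, a one
// costs a '0' bit plus mel_exp[k] bits of run position, and k adapts up on
// full runs and down on ones. The buffer size here is arbitrary.
static void mel_demo_sketch()
{
  ui8 buf[64];
  mel_struct mel;
  mel_init(&mel, sizeof(buf), buf);
  const bool events[6] = { false, false, false, true, false, true };
  for (int i = 0; i < 6; ++i)
    mel_encode(&mel, events[i]);   // e.g. "is this quad significant?"
  if (mel.run > 0)                 // flush the pending run, as
    mel_emit_bit(&mel, 1);         // terminate_mel_vlc() does below
}
// --------------------------------------------------------------------------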
////////////////////////////////////////////////////////////////////////// + static inline void + terminate_mel_vlc(mel_struct* melp, vlc_struct* vlcp) + { + if (melp->run > 0) + mel_emit_bit(melp, 1); + + if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) { + *(vlcp->buf - vlcp->pos) = 0x7f; + vlcp->pos++; + vlcp->tmp >>= 7; + vlcp->used_bits -= 7; + } + + melp->tmp = melp->tmp << melp->remaining_bits; + int mel_mask = (0xFF << melp->remaining_bits) & 0xFF; + int vlc_mask = 0xFF >> (8 - vlcp->used_bits); + if ((mel_mask | vlc_mask) == 0) + return; //last mel byte cannot be 0xFF, since then + //melp->remaining_bits would be < 8 + if (melp->pos >= melp->buf_size) + OJPH_ERROR(0x00020003, "mel encoder's buffer is full"); + ui8 vlcp_tmp = (ui8)vlcp->tmp; + int fuse = melp->tmp | vlcp_tmp; + if ( ( ((fuse ^ melp->tmp) & mel_mask) + | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0 + && (fuse != 0xFF) && vlcp->pos > 1) + { + melp->buf[melp->pos++] = (ui8)fuse; + } + else + { + if (vlcp->pos >= vlcp->buf_size) + OJPH_ERROR(0x00020004, "vlc encoder's buffer is full"); + melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF + *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp; + vlcp->pos++; + } + } + +///////////////////////////////////////////////////////////////////////// +// +///////////////////////////////////////////////////////////////////////// + struct ms_struct { + //storage + ui8* buf; //pointer to data buffer + ui32 pos; //position of next writing within buf + ui32 buf_size; //size of buffer, which we must not exceed + + int max_bits; //maximum number of bits that can be stored in tmp + int used_bits; //number of occupied bits in tmp + ui32 tmp; //temporary storage of coded bits + }; + + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_init(ms_struct* msp, ui32 buffer_size, ui8* data) + { + msp->buf = data; + msp->pos = 0; + msp->buf_size = buffer_size; + msp->max_bits = 8; + msp->used_bits = 0; + msp->tmp = 0; + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_encode(ms_struct* msp, ui64 cwd, int cwd_len) + { + while (cwd_len > 0) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full"); + int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len); + msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits; + msp->used_bits += t; + cwd >>= t; + cwd_len -= t; + if (msp->used_bits >= msp->max_bits) + { + msp->buf[msp->pos++] = (ui8)msp->tmp; + msp->max_bits = (msp->tmp == 0xFF) ? 
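// --------------------------------------------------------------------------
// The ternary completing on the next line is JPEG 2000 bit stuffing: after
// a byte equal to 0xFF only 7 bits of the following byte may be used, so
// the MagSgn stream can never contain a forbidden 0xFF9x..0xFFFF pair.
// The same rule in isolation (illustrative helper, not part of the patch):
static int usable_bits_after_sketch(ui8 prev_byte)
{
  return prev_byte == 0xFF ? 7 : 8;
}
// --------------------------------------------------------------------------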
7 : 8; + msp->tmp = 0; + msp->used_bits = 0; + } + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_terminate(ms_struct* msp) + { + if (msp->used_bits) + { + int t = msp->max_bits - msp->used_bits; //unused bits + msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits; + msp->used_bits += t; + if (msp->tmp != 0xFF) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full"); + msp->buf[msp->pos++] = (ui8)msp->tmp; + } + } + else if (msp->max_bits == 7) + msp->pos--; + } + +#define ZERO _mm256_setzero_si256() +#define ONE _mm256_set1_epi32(1) + +// https://stackoverflow.com/a/58827596 +inline __m256i avx2_lzcnt_epi32(__m256i v) { + // prevent value from being rounded up to the next power of two + v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep 8 MSB + + v = _mm256_castps_si256(_mm256_cvtepi32_ps(v)); // convert an integer to float + v = _mm256_srli_epi32(v, 23); // shift down the exponent + v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias + v = _mm256_min_epi16(v, _mm256_set1_epi32(32)); // clamp at 32 + + return v; +} + +inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) { + return _mm256_xor_si256(_mm256_cmpeq_epi32(v, v2), _mm256_set1_epi32((int32_t)0xffffffff)); +} + +static void proc_pixel(__m256i *src_vec, ui32 p, + __m256i *eq_vec, __m256i *s_vec, + __m256i &rho_vec, __m256i &e_qmax_vec) +{ + __m256i val_vec[4]; + __m256i _eq_vec[4]; + __m256i _s_vec[4]; + __m256i _rho_vec[4]; + + for (ui32 i = 0; i < 4; ++i) { + /* val = t + t; //multiply by 2 and get rid of sign */ + val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]); + + /* val >>= p; // 2 \mu_p + x */ + val_vec[i] = _mm256_srli_epi32(val_vec[i], (int)p); + + /* val &= ~1u; // 2 \mu_p */ + val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((int)~1u)); + + /* if (val) { */ + const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO); + + /* rho[i] = 1 << i; + * rho is processed below. 
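// --------------------------------------------------------------------------
// Why avx2_lzcnt_epi32 above works, restated for one lane (a sketch that
// assumes IEEE-754 single precision): v &= ~(v >> 8) keeps the 8 MSBs so
// rounding cannot carry into the exponent, the int-to-float conversion then
// leaves floor(log2(v)) + 127 in bits 23..30, and 158 - exponent is the
// leading zero count, clamped to 32 for v == 0. Name is illustrative.
static ui32 lzcnt_one_lane_sketch(ui32 v)
{
  v &= ~(v >> 8);                  // same masking as the vector code
  float f = (float)v;
  ui32 bits;
  memcpy(&bits, &f, sizeof(bits)); // bit-cast; the sign bit is 0 here
  ui32 e = bits >> 23;             // biased exponent
  ui32 lz = 158 - e;               // 158 = 127 + 31
  return lz > 32 ? 32 : lz;        // v == 0 maps to 32
}
// --------------------------------------------------------------------------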
+ */ + + /* e_q[i] = 32 - (int)count_leading_zeros(--val); //2\mu_p - 1 */ + val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE); + _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]); + _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]); + + /* e_qmax[i] = ojph_max(e_qmax[i], e_q[i]); + * e_qmax is processed below + */ + + /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */ + val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE); + _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31); + _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]); + + _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask); + _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask); + val_vec[i] = _mm256_srli_epi32(val_notmask, 31); + /* } */ + } + + const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + + /* Reorder from + * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7] + * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5],.[1, 6], [1, 7] + * *_vec[2]:[0, 8], [0, 9], [0,10], [0,11], [0,12], [0,13], [0,14], [0,15] + * *_vec[3]:[1, 8], [1, 9], [1,10], [1,11], [1,12], [1,13], [1,14], [1,15] + * to + * *_vec[0]:[0, 0], [0, 2], [0, 4], [0, 6], [0, 8], [0,10], [0,12], [0,14] + * *_vec[1]:[1, 0], [1, 2], [1, 4], [1, 6], [1, 8], [1,10], [1,12], [1,14] + * *_vec[2]:[0, 1], [0, 3], [0, 5], [0, 7], [0, 9], [0,11], [0,13], [0,15] + * *_vec[3]:[1, 1], [1, 3], [1, 5], [1, 7], [1, 9], [1,11], [1,13], [1,15] + */ + __m256i tmp1, tmp2; + for (ui32 i = 0; i < 2; ++i) { + tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx); + tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx); + eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4)); + eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4)); + + tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx); + tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx); + s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4)); + s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4)); + + tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx); + tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx); + _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4)); + _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4)); + } + + e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]); + e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]); + e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]); + _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1); + _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2); + _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3); + rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]); + rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]); + rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]); +} + +/* from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, ...] + * [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, ...] + * [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, ...] + * [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ...] + * + * to [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31, + * 0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33] + * + * [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35, + * 0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37] + * + * [..] 
+ */ +static void rotate_matrix(__m256i *matrix) +{ + __m256i tmp1 = _mm256_unpacklo_epi32(matrix[0], matrix[1]); + __m256i tmp2 = _mm256_unpacklo_epi32(matrix[2], matrix[3]); + __m256i tmp3 = _mm256_unpackhi_epi32(matrix[0], matrix[1]); + __m256i tmp4 = _mm256_unpackhi_epi32(matrix[2], matrix[3]); + + matrix[0] = _mm256_unpacklo_epi64(tmp1, tmp2); + matrix[1] = _mm256_unpacklo_epi64(tmp3, tmp4); + matrix[2] = _mm256_unpackhi_epi64(tmp1, tmp2); + matrix[3] = _mm256_unpackhi_epi64(tmp3, tmp4); + + tmp1 = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x20); + matrix[2] = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x31); + matrix[0] = tmp1; + + tmp1 = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x20); + matrix[3] = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x31); + matrix[1] = tmp1; +} + +static void proc_ms_encode(ms_struct *msp, + __m256i &tuple_vec, + __m256i &uq_vec, + __m256i &rho_vec, + __m256i *s_vec) +{ + __m256i m_vec[4]; + + /* Prepare parameters for ms_encode */ + /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */ + auto tmp = _mm256_and_si256(tuple_vec, ONE); + tmp = _mm256_sub_epi32(uq_vec, tmp); + auto tmp1 = _mm256_and_si256(rho_vec, ONE); + auto mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[0] = _mm256_and_si256(mask, tmp); + + /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */ + tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2)); + tmp = _mm256_srli_epi32(tmp, 1); + tmp = _mm256_sub_epi32(uq_vec, tmp); + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2)); + mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[1] = _mm256_and_si256(mask, tmp); + + /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */ + tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4)); + tmp = _mm256_srli_epi32(tmp, 2); + tmp = _mm256_sub_epi32(uq_vec, tmp); + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4)); + mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[2] = _mm256_and_si256(mask, tmp); + + /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */ + tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8)); + tmp = _mm256_srli_epi32(tmp, 3); + tmp = _mm256_sub_epi32(uq_vec, tmp); + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8)); + mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[3] = _mm256_and_si256(mask, tmp); + + rotate_matrix(m_vec); + /* s_vec from + * s_vec[0]:[0, 0], [0, 2] ... [0,14], [0, 16], [0, 18] ... [0,30] + * s_vec[1]:[1, 0], [1, 2] ... [1,14], [1, 16], [1, 18] ... [1,30] + * s_vec[2]:[0, 1], [0, 3] ... [0,15], [0, 17], [0, 19] ... [0,31] + * s_vec[3]:[1, 1], [1, 3] ... [1,15], [1, 17], [1, 19] ... 
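// --------------------------------------------------------------------------
// Scalar form of the m_vec arithmetic above (illustrative, not part of the
// patch): a significant sample n of a quad sends m_n = Uq - emb_n
// magnitude-sign bits, where emb_n is that sample's e_k bit in the VLC
// tuple (the top magnitude bit is implicit when its exponent equals Uq).
static int magsgn_bits_sketch(int rho, int tuple, int Uq, int n) // n: 0..3
{
  if (((rho >> n) & 1) == 0)
    return 0;                  // insignificant sample: nothing to send
  int emb = (tuple >> n) & 1;  // e_k bit for this sample
  return Uq - emb;             // m_n
}
// --------------------------------------------------------------------------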
[1,31] + * to + * s_vec[0]:[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]...[0, 7], [1, 7] + * s_vec[1]:[0, 8], [1, 8], [0, 9], [1, 9], [0,10], [1,10]...[0,15], [1,15] + * s_vec[2]:[0,16], [1,16], [0,17], [1,17], [0,18], [1,18]...[0,23], [1,23] + * s_vec[3]:[0,24], [1,24], [0,25], [1,25], [0,26], [1,26]...[0,31], [1,31] + */ + rotate_matrix(s_vec); + + ui32 cwd[8]; + int cwd_len[8]; + ui64 _cwd = 0; + int _cwd_len = 0; + + /* Each iteration processes 8 bytes * 2 lines */ + for (ui32 i = 0; i < 4; ++i) { + /* cwd = s[i * 4 + 0] & ((1U << m) - 1) + * cwd_len = m + */ + _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]); + tmp = _mm256_sllv_epi32(ONE, m_vec[i]); + tmp = _mm256_sub_epi32(tmp, ONE); + tmp = _mm256_and_si256(tmp, s_vec[i]); + _mm256_storeu_si256((__m256i*)cwd, tmp); + + for (ui32 j = 0; j < 4; ++j) { + ui32 idx = j * 2; + _cwd = cwd[idx]; + _cwd_len = cwd_len[idx]; + _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len; + _cwd_len += cwd_len[idx + 1]; + ms_encode(msp, _cwd, _cwd_len); + } + } +} + +static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec, + __m256i &e_qmax_vec) +{ + /* if (u_q[i] > 0) { + * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]); + * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1; + * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2; + * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3; + * } + */ + auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO); + + auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec); + auto eps_vec = _mm256_srli_epi32(mask, 31); + + mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec); + auto tmp = _mm256_srli_epi32(mask, 31); + tmp = _mm256_slli_epi32(tmp, 1); + eps_vec = _mm256_or_si256(eps_vec, tmp); + + mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec); + tmp = _mm256_srli_epi32(mask, 31); + tmp = _mm256_slli_epi32(tmp, 2); + eps_vec = _mm256_or_si256(eps_vec, tmp); + + mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec); + tmp = _mm256_srli_epi32(mask, 31); + tmp = _mm256_slli_epi32(tmp, 3); + eps_vec = _mm256_or_si256(eps_vec, tmp); + + return _mm256_and_si256(u_q_mask, eps_vec); +} + +static void update_lep(ui32 x, __m256i &prev_e_val_vec, + __m256i *eq_vec, __m256i *e_val_vec, + const __m256i left_shift) +{ + /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + * lep[0] = (ui8)e_q[3]; + * Compare e_q[1] with e_q[3] of the previous round. + */ + auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift); + tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0); + prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0); + e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp); +} + + +static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec, + __m256i &rho_vec, __m256i *cx_val_vec, + const __m256i left_shift) +{ + /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + * lcxp[0] = (ui8)((rho[0] & 8) >> 3); + * Or (rho[0] & 2) and (rho[0] of the previous round & 8). 
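// --------------------------------------------------------------------------
// update_lep/update_lcxp above vectorize the scalar line state kept by the
// 64-bit encoder earlier in this patch: per quad column, e_val records the
// larger of the two bottom-row exponents so the next quad row can form
// max_e, and cx_val records the bottom-row significance bits that seed the
// next row's contexts. One-quad scalar restatement (illustrative names):
static void update_line_state_sketch(ui8* lep, ui8* lcxp,
                                     int e_q1, int e_q3, int rho)
{
  lep[0] = (ui8)ojph_max((int)lep[0], e_q1);   // close the previous column
  lep[1] = (ui8)e_q3;                          // open the next one
  lcxp[0] = (ui8)(lcxp[0] | ((rho & 2) >> 1)); // bottom-left significance
  lcxp[1] = (ui8)((rho & 8) >> 3);             // bottom-right significance
}
// --------------------------------------------------------------------------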
+ */ + auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift); + tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0); + prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0); + + tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8)); + tmp = _mm256_srli_epi32(tmp, 3); + + auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2)); + tmp1 = _mm256_srli_epi32(tmp1, 1); + cx_val_vec[x] = _mm256_or_si256(tmp, tmp1); +} + +static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec, + __m256i &eps_vec, ui32 *vlc_tbl) +{ + /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */ + auto tmp = _mm256_slli_epi32(cq_vec, 8); + auto tmp1 = _mm256_slli_epi32(rho_vec, 4); + tmp = _mm256_add_epi32(tmp, tmp1); + tmp = _mm256_add_epi32(tmp, eps_vec); + return _mm256_i32gather_epi32((const int *)vlc_tbl, tmp, 4); +} + +static __m256i proc_cq1(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec, + const __m256i right_shift) +{ + ojph_unused(x); + ojph_unused(cx_val_vec); + ojph_unused(right_shift); + + /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */ + auto tmp = _mm256_srli_epi32(rho_vec, 1); + auto tmp1 = _mm256_and_si256(rho_vec, ONE); + return _mm256_or_si256(tmp, tmp1); +} + +static __m256i proc_cq2(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec, + const __m256i right_shift) +{ + // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2)) + // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2)); + auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift); + auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift); + + tmp = _mm256_insert_epi64(tmp, _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3); + tmp = _mm256_slli_epi32(tmp, 2); + auto tmp1 = _mm256_insert_epi32(lcxp1_vec, _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7); + tmp = _mm256_add_epi32(tmp1, tmp); + + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4)); + tmp1 = _mm256_srli_epi32(tmp1, 1); + tmp = _mm256_or_si256(tmp, tmp1); + + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8)); + tmp1 = _mm256_srli_epi32(tmp1, 2); + + return _mm256_or_si256(tmp, tmp1); +} + +using fn_proc_cq = __m256i (*)(ui32, __m256i *, __m256i &, const __m256i); + +static void proc_mel_encode1(mel_struct *melp, __m256i &cq_vec, + __m256i &rho_vec, __m256i u_q_vec, ui32 ignore, + const __m256i right_shift) +{ + int32_t mel_need_encode[8]; + int32_t mel_need_encode2[8]; + int32_t mel_bit[8]; + int32_t mel_bit2[8]; + /* Prepare mel_encode params */ + /* if (c_q[i] == 0) { */ + _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO)); + /* mel_encode(&mel, rho[i] != 0); */ + _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31)); + /* } */ + + /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */ + auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift); + auto tmp1 = _mm256_min_epi32(u_q_vec, tmp); + _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31)); + + /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */ + auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO); + _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO))); + + ui32 i_max = 8 - (ignore / 2); + + for (ui32 i = 0; i < i_max; i += 2) { + if (mel_need_encode[i]) { + mel_encode(melp, mel_bit[i]); + } + + if (i + 1 < i_max) { + if (mel_need_encode[i + 1]) { + mel_encode(melp, mel_bit[i + 1]); + } + } + + if 
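// --------------------------------------------------------------------------
// The two context builders above, in scalar form: on the initial quad row
// the context of quad q+1 depends only on quad q (proc_cq1); on every other
// row it also folds in the two line-state values from the row above
// (proc_cq2). Helper names are illustrative:
static int cq_initial_row_sketch(int rho_prev)
{
  return (rho_prev >> 1) | (rho_prev & 1);
}
static int cq_other_rows_sketch(int lcxp1, int lcxp2, int rho_prev)
{
  return (lcxp1 + (lcxp2 << 2))
       | (((rho_prev & 4) >> 1) | ((rho_prev & 8) >> 2));
}
// --------------------------------------------------------------------------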
(mel_need_encode2[i]) { + mel_encode(melp, mel_bit2[i]); + } + } +} + +static void proc_mel_encode2(mel_struct *melp, __m256i &cq_vec, + __m256i &rho_vec, __m256i u_q_vec, ui32 ignore, + const __m256i right_shift) +{ + ojph_unused(u_q_vec); + ojph_unused(right_shift); + int32_t mel_need_encode[8]; + int32_t mel_bit[8]; + + /* Prepare mel_encode params */ + /* if (c_q[i] == 0) { */ + _mm256_storeu_si256((__m256i*)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO)); + /* mel_encode(&mel, rho[i] != 0); */ + _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31)); + /* } */ + + ui32 i_max = 8 - (ignore / 2); + + for (ui32 i = 0; i < i_max; ++i) { + if (mel_need_encode[i]) { + mel_encode(melp, mel_bit[i]); + } + } +} + +using fn_proc_mel_encode = void (*)(mel_struct *, __m256i &, __m256i &, + __m256i, ui32, const __m256i); + +static void proc_vlc_encode1(vlc_struct *vlcp, ui32 *tuple, + ui32 *u_q, ui32 ignore) +{ + ui32 i_max = 8 - (ignore / 2); + + for (ui32 i = 0; i < i_max; i += 2) { + /* 7 bits */ + ui32 val = tuple[i + 0] >> 4; + int size = tuple[i + 0] & 7; + + if (i + 1 < i_max) { + /* 7 bits */ + val |= (tuple[i + 1] >> 4) << size; + size += tuple[i + 1] & 7; + } + + if (u_q[i] > 2 && u_q[i + 1] > 2) { + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i] - 2]) << size; + size += ulvc_cwd_pre_len[u_q[i] - 2]; + + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size; + size += ulvc_cwd_pre_len[u_q[i + 1] - 2]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i] - 2]) << size; + size += ulvc_cwd_suf_len[u_q[i] - 2]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size; + size += ulvc_cwd_suf_len[u_q[i + 1] - 2]; + + } else if (u_q[i] > 2 && u_q[i + 1] > 0) { + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i]]) << size; + size += ulvc_cwd_pre_len[u_q[i]]; + + /* 1 bit */ + val |= (u_q[i + 1] - 1) << size; + size += 1; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i]]) << size; + size += ulvc_cwd_suf_len[u_q[i]]; + + } else { + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i]]) << size; + size += ulvc_cwd_pre_len[u_q[i]]; + + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i + 1]]) << size; + size += ulvc_cwd_pre_len[u_q[i + 1]]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i]]) << size; + size += ulvc_cwd_suf_len[u_q[i]]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i + 1]]) << size; + size += ulvc_cwd_suf_len[u_q[i + 1]]; + } + + vlc_encode(vlcp, val, size); + } +} + +static void proc_vlc_encode2(vlc_struct *vlcp, ui32 *tuple, + ui32 *u_q, ui32 ignore) +{ + ui32 i_max = 8 - (ignore / 2); + + for (ui32 i = 0; i < i_max; i += 2) { + /* 7 bits */ + ui32 val = tuple[i + 0] >> 4; + int size = tuple[i + 0] & 7; + + if (i + 1 < i_max) { + /* 7 bits */ + val |= (tuple[i + 1] >> 4) << size; + size += tuple[i + 1] & 7; + } + + /* 3 bits */ + val |= ulvc_cwd_pre[u_q[i]] << size; + size += ulvc_cwd_pre_len[u_q[i]]; + + /* 3 bits */ + val |= (ulvc_cwd_pre[u_q[i + 1]]) << size; + size += ulvc_cwd_pre_len[u_q[i + 1]]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i + 0]]) << size; + size += ulvc_cwd_suf_len[u_q[i + 0]]; + + /* 5 bits */ + val |= (ulvc_cwd_suf[u_q[i + 1]]) << size; + size += ulvc_cwd_suf_len[u_q[i + 1]]; + + vlc_encode(vlcp, val, size); + } +} + +using fn_proc_vlc_encode = void (*)(vlc_struct *, ui32 *, ui32 *, ui32); + +void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs, + ui32 num_passes, ui32 _width, ui32 height, + ui32 stride, ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) +{ + ojph_unused(num_passes); //currently not 
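// --------------------------------------------------------------------------
// Scalar reference (illustrative, not part of the patch) for the u_q
// signalling that proc_mel_encode1 and proc_vlc_encode1 above implement
// jointly on the initial quad row; it mirrors the 64-bit encoder earlier in
// this patch (the AVX2 tables carry no extension part):
static void encode_u_pair_initial_sketch(mel_struct* mel, vlc_struct* vlc,
                                         ui32 u0, ui32 u1)
{
  if (u0 > 0 && u1 > 0)
    mel_encode(mel, ojph_min(u0, u1) > 2);  // "both exceed 2" escape bit
  if (u0 > 2 && u1 > 2) {                   // both coded with offset 2
    vlc_encode(vlc, ulvc_cwd_pre[u0 - 2], ulvc_cwd_pre_len[u0 - 2]);
    vlc_encode(vlc, ulvc_cwd_pre[u1 - 2], ulvc_cwd_pre_len[u1 - 2]);
    vlc_encode(vlc, ulvc_cwd_suf[u0 - 2], ulvc_cwd_suf_len[u0 - 2]);
    vlc_encode(vlc, ulvc_cwd_suf[u1 - 2], ulvc_cwd_suf_len[u1 - 2]);
  } else if (u0 > 2 && u1 > 0) {            // u1 is 1 or 2: one raw bit
    vlc_encode(vlc, ulvc_cwd_pre[u0], ulvc_cwd_pre_len[u0]);
    vlc_encode(vlc, u1 - 1, 1);
    vlc_encode(vlc, ulvc_cwd_suf[u0], ulvc_cwd_suf_len[u0]);
  } else {                                  // both coded as-is
    vlc_encode(vlc, ulvc_cwd_pre[u0], ulvc_cwd_pre_len[u0]);
    vlc_encode(vlc, ulvc_cwd_pre[u1], ulvc_cwd_pre_len[u1]);
    vlc_encode(vlc, ulvc_cwd_suf[u0], ulvc_cwd_suf_len[u0]);
    vlc_encode(vlc, ulvc_cwd_suf[u1], ulvc_cwd_suf_len[u1]);
  }
}
// --------------------------------------------------------------------------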
used + + ui32 width = (_width + 15) & ~15u; + ui32 ignore = width - _width; + const int ms_size = (16384 * 16 + 14) / 15; //more than enough + const int mel_vlc_size = 3072; //more than enough + const int mel_size = 192; + const int vlc_size = mel_vlc_size - mel_size; + + ui8 ms_buf[ms_size]; + ui8 mel_vlc_buf[mel_vlc_size]; + ui8 *mel_buf = mel_vlc_buf; + ui8 *vlc_buf = mel_vlc_buf + mel_size; + + mel_struct mel; + mel_init(&mel, mel_size, mel_buf); + vlc_struct vlc; + vlc_init(&vlc, vlc_size, vlc_buf); + ms_struct ms; + ms_init(&ms, ms_size, ms_buf); + + const ui32 p = 30 - missing_msbs; + + //e_val: E values for a line (these are the highest set bit) + //cx_val: is the context values + //Each byte stores the info for the 2 sample. For E, it is maximum + // of the two samples, while for cx, it is the OR of these two samples. + //The maximum is between the pixel at the bottom left of one quad + // and the bottom right of the earlier quad. The same is true for cx. + //For a 1024 pixels, we need 512 bytes, the 2 extra, + // one for the non-existing earlier quad, and one for beyond the + // the end + const __m256i right_shift = _mm256_set_epi32( + 0, 7, 6, 5, 4, 3, 2, 1 + ); + + const __m256i left_shift = _mm256_set_epi32( + 6, 5, 4, 3, 2, 1, 0, 7 + ); + + ui32 n_loop = (width + 15) / 16; + + __m256i e_val_vec[65]; + for (ui32 i = 0; i > 3); */ + __m256i tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8)); + cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3); + + prev_e_val_vec = ZERO; + prev_cx_val_vec = ZERO; + + ui32 *sp = buf + y * stride; + + /* 16 bytes per iteration */ + for (ui32 x = 0; x < n_loop; ++x) { + + /* t = sp[i]; */ + if ((x == (n_loop - 1)) && (_width % 16)) { + ui32 tmp_buf[16] = { 0 }; + memcpy(tmp_buf, sp, (_width % 16) * sizeof(ui32)); + src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf)); + src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8)); + if (y + 1 < height) { + memcpy(tmp_buf, sp + stride, (_width % 16) * sizeof(ui32)); + src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf)); + src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8)); + } + else { + src_vec[1] = ZERO; + src_vec[3] = ZERO; + } + } + else { + src_vec[0] = _mm256_loadu_si256((__m256i*)(sp)); + src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8)); + + if (y + 1 < height) { + src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride)); + src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride)); + } + else { + src_vec[1] = ZERO; + src_vec[3] = ZERO; + } + sp += 16; + } + + /* src_vec layout: + * src_vec[0]:[0, 0],[0, 1],[0, 2],[0, 3],[0, 4],[0, 5],.[0, 6],.[0, 7] + * src_vec[1]:[1, 0],[1, 1],[1, 2],[1, 3],[1, 4],[1, 5],.[1, 6],.[1, 7] + * src_vec[2]:[0, 8],[0, 9],[0,10],[0,11],[0,12],[0,13],.[0,14], [0,15] + * src_vec[3]:[1, 8],[1, 9],[1,10],[1,11],[1,12],[1,13],.[1,14], [1,15] + */ + __m256i rho_vec, e_qmax_vec; + proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec); + + // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1; + tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift); + tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7); + + auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]); + max_e_vec = _mm256_sub_epi32(max_e_vec, ONE); + + // kappa[i] = (rho[i] & (rho[i] - 1)) ? 
ojph_max(1, max_e[i]) : 1; + tmp = _mm256_max_epi32(max_e_vec, ONE); + __m256i tmp1 = _mm256_sub_epi32(rho_vec, ONE); + tmp1 = _mm256_and_si256(rho_vec, tmp1); + + auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO); + auto kappa_vec1_ = _mm256_and_si256(cmp, ONE); + auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp); + const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_); + + /* cq[1 - 16] = cq_vec + * cq[0] = prev_cq_vec[0] + */ + tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift); + + auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift); + cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0); + prev_cq = (ui32)_mm256_extract_epi32(tmp, 7); + + update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift); + update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift); + + /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */ + /* u_q[i] = Uq[i] - kappa[i]; */ + auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec); + auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec); + + auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec); + __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl); + ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0; + + proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore, + right_shift); + + proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec); + + // vlc_encode(&vlc, tuple[i*2+0] >> 8, (tuple[i*2+0] >> 4) & 7); + // vlc_encode(&vlc, tuple[i*2+1] >> 8, (tuple[i*2+1] >> 4) & 7); + ui32 u_q[8]; + ui32 tuple[8]; + /* The tuple is scaled by 4 due to: + * vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7, true); + * So in the vlc_encode, the tuple will only be scaled by 2. + */ + tuple_vec = _mm256_srli_epi32(tuple_vec, 4); + _mm256_storeu_si256((__m256i*)tuple, tuple_vec); + _mm256_storeu_si256((__m256i*)u_q, u_q_vec); + + proc_vlc_encode(&vlc, tuple, u_q, _ignore); + } + + tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift); + tmp = _mm256_slli_epi32(tmp, 2); + tmp = _mm256_add_epi32(tmp, cx_val_vec[0]); + prev_cq = (ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp)); + + proc_cq = proc_cq2; + vlc_tbl = vlc_tbl1; + proc_mel_encode = proc_mel_encode2; + proc_vlc_encode = proc_vlc_encode2; + } + + ms_terminate(&ms); + terminate_mel_vlc(&mel, &vlc); + + //copy to elastic + lengths[0] = mel.pos + vlc.pos + ms.pos; + elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded); + memcpy(coded->buf, ms.buf, ms.pos); + memcpy(coded->buf + ms.pos, mel.buf, mel.pos); + memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos); + + // put in the interface locator word + ui32 num_bytes = mel.pos + vlc.pos; + coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4); + coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0; + coded->buf[lengths[0]-2] = + (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF)); + + coded->avail_size -= lengths[0]; +} + +} /* namespace local */ +} /* namespace ojph */ diff --git a/src/core/coding/ojph_block_encoder_avx512.cpp b/src/core/coding/ojph_block_encoder_avx512.cpp index 5912b09..b35373a 100644 --- a/src/core/coding/ojph_block_encoder_avx512.cpp +++ b/src/core/coding/ojph_block_encoder_avx512.cpp @@ -64,8 +64,8 @@ namespace ojph { // index is (c_q << 8) + (rho << 4) + eps // data is (cwd << 8) + (cwd_len << 4) + eps // table 0 is for the initial line of quads - static ui32 vlc_tbl0[2048] = { 0 }; - static ui32 vlc_tbl1[2048] = { 0 }; + static ui32 vlc_tbl0[2048]; + static ui32 vlc_tbl1[2048]; //UVLC encoding static ui32 ulvc_cwd_pre[33]; @@ -218,18 +218,18 @@ 
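// --------------------------------------------------------------------------
// This hunk replaces a load-time static initializer with an explicit,
// lazily invoked initialize_block_encoder_tables_avx512(); the new AVX2
// file above follows the same pattern. The idiom in isolation (sketch;
// build_tables_sketch() is a hypothetical stand-in for the real builders):
static bool tables_ready_sketch = false;
static bool build_tables_sketch() { return true; } // stand-in
static bool initialize_tables_once_sketch()
{
  if (!tables_ready_sketch)
    tables_ready_sketch = build_tables_sketch();
  return tables_ready_sketch;
}
// --------------------------------------------------------------------------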
namespace ojph { } ///////////////////////////////////////////////////////////////////////// - bool initialize_tables() { - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { - bool result; - result = vlc_init_tables(); - result = result && uvlc_init_tables(); - return result; - } - return false; - } + static bool tables_initialized = false; ///////////////////////////////////////////////////////////////////////// - static bool tables_initialized = initialize_tables(); + bool initialize_block_encoder_tables_avx512() { + if (!tables_initialized) { + memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); + memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); + tables_initialized = vlc_init_tables(); + tables_initialized = tables_initialized && uvlc_init_tables(); + } + return tables_initialized; + } ///////////////////////////////////////////////////////////////////////// // @@ -377,6 +377,13 @@ namespace ojph { if (melp->run > 0) mel_emit_bit(melp, 1); + if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) { + *(vlcp->buf - vlcp->pos) = 0x7f; + vlcp->pos++; + vlcp->tmp >>= 7; + vlcp->used_bits -= 7; + } + melp->tmp = melp->tmp << melp->remaining_bits; int mel_mask = (0xFF << melp->remaining_bits) & 0xFF; int vlc_mask = 0xFF >> (8 - vlcp->used_bits); diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h index 947f25b..29ab7a5 100644 --- a/src/core/common/ojph_arch.h +++ b/src/core/common/ojph_arch.h @@ -166,6 +166,32 @@ namespace ojph { #endif } + ///////////////////////////////////////////////////////////////////////////// + static inline ui32 population_count64(ui64 val) + { + #if defined(OJPH_COMPILER_MSVC) \ + && (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) + return (ui32)__popcnt64(val); + #elif (defined OJPH_COMPILER_GNUC) + return (ui32)__builtin_popcountll(val); + #else + const ui64 k1 = 0x5555555555555555ull; + const ui64 k2 = 0x3333333333333333ull; + const ui64 k4 = 0x0F0F0F0F0F0F0F0Full; + const ui64 kf = 0x0101010101010101ull; + + // put count of each 2 bits into those 2 bits + val = val - ((val >> 1) & k1); + // put count of each 4 bits into those 4 bits + val = (val & k2) + ((val >> 2) & k2); + // put count of each 8 bits into those 8 bits + val = (val + (val >> 4)) & k4 ; + // returns 8 most significant bits of x + (x<<8) + (x<<16) + (x<<24) + ... 
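// --------------------------------------------------------------------------
// The portable branch above is a standard SWAR reduction: pair sums, then
// nibble sums, then byte sums, then one multiply gathers the byte sums into
// the top byte. A quick check of values it must produce (illustrative):
static bool popcount64_selfcheck_sketch()
{
  return population_count64(0ull) == 0
      && population_count64(1ull) == 1
      && population_count64(0x8000000000000000ull) == 1
      && population_count64(0x5555555555555555ull) == 32
      && population_count64(0xFFFFFFFFFFFFFFFFull) == 64;
}
// --------------------------------------------------------------------------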
+ val = (val * kf) >> 56; + return (ui32) val; + #endif + } + ///////////////////////////////////////////////////////////////////////////// #ifdef OJPH_COMPILER_MSVC #pragma intrinsic(_BitScanReverse) @@ -188,6 +214,29 @@ namespace ojph { #endif } + ///////////////////////////////////////////////////////////////////////////// +#ifdef OJPH_COMPILER_MSVC + #pragma intrinsic(_BitScanReverse64) +#endif + static inline ui32 count_leading_zeros(ui64 val) + { + #ifdef OJPH_COMPILER_MSVC + unsigned long result = 0; + _BitScanReverse64(&result, val); + return 63 ^ (ui32)result; + #elif (defined OJPH_COMPILER_GNUC) + return (ui32)__builtin_clzll(val); + #else + val |= (val >> 1); + val |= (val >> 2); + val |= (val >> 4); + val |= (val >> 8); + val |= (val >> 16); + val |= (val >> 32); + return 64 - population_count64(val); + #endif + } + ///////////////////////////////////////////////////////////////////////////// #ifdef OJPH_COMPILER_MSVC #pragma intrinsic(_BitScanForward) @@ -237,9 +286,15 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // constants //////////////////////////////////////////////////////////////////////////// - const ui32 byte_alignment = 64; // 64 bytes == 512 bits - const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); - const ui32 object_alignment = 8; + #ifndef OJPH_EMSCRIPTEN + const ui32 byte_alignment = 64; // 64 bytes == 512 bits + const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); + const ui32 object_alignment = 8; + #else + const ui32 byte_alignment = 16; // 16 bytes == 128 bits + const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); + const ui32 object_alignment = 8; + #endif //////////////////////////////////////////////////////////////////////////// // templates for alignment @@ -247,17 +302,17 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // finds the size such that it is a multiple of byte_alignment - template + template size_t calc_aligned_size(size_t size) { size = size * sizeof(T) + N - 1; size &= ~((1ULL << (31 - count_leading_zeros(N))) - 1); - size >>= (31 - count_leading_zeros(sizeof(T))); + size >>= (63 - count_leading_zeros((ui64)sizeof(T))); return size; } //////////////////////////////////////////////////////////////////////////// // moves the pointer to first address that is a multiple of byte_alignment - template + template inline T *align_ptr(T *ptr) { intptr_t p = reinterpret_cast(ptr); p += N - 1; diff --git a/src/core/common/ojph_codestream.h b/src/core/common/ojph_codestream.h index 5f6dcdb..f7a8065 100644 --- a/src/core/common/ojph_codestream.h +++ b/src/core/common/ojph_codestream.h @@ -57,10 +57,11 @@ namespace ojph { class param_siz; class param_cod; class param_qcd; + class param_nlt; class comment_exchange; class mem_fixed_allocator; struct point; - struct line_buf; + class line_buf; class outfile_base; class infile_base; @@ -318,7 +319,7 @@ namespace ojph { * @brief Returns the underlying SIZ marker segment object * * @return param_siz This object holds SIZ marker segment information, - * which are related to codestream dimensions, number + * which deals with codestream dimensions, number * of components, bit depth, ... etc. 
*/ param_siz access_siz(); @@ -327,7 +328,7 @@ namespace ojph { * @brief Returns the underlying COD marker segment object * * @return param_cod This object holds COD marker segment information, - * which are related to coding parameters, such as + * which deals with coding parameters, such as * codeblock sizes, progression order, reversible, * ... etc. */ @@ -337,11 +338,20 @@ namespace ojph { * @brief Returns the underlying QCD marker segment object * * @return param_qcd This object holds QCD marker segment information, - * which are related to quantization parameters -- + * which deals with quantization parameters -- * quantization step size for each subband. */ param_qcd access_qcd(); + /** + * @brief Returns the underlying NLT marker segment object + * + * @return param_nlt This object holds NLT marker segment information, + * which deals with non-linearity point transformation + * for each component. + */ + param_nlt access_nlt(); + /** * @brief Query if the codestream extraction is planar or not. * See the documentation for ojph::codestream::set_planar() diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h index d7497cd..99897f3 100644 --- a/src/core/common/ojph_mem.h +++ b/src/core/common/ojph_mem.h @@ -132,9 +132,23 @@ namespace ojph { }; ///////////////////////////////////////////////////////////////////////////// - struct line_buf + class line_buf { - line_buf() : size(0), pre_size(0), i32(0) {} + public: + enum : ui32 { + LFT_UNDEFINED = 0x00, // Type is undefined/uninitialized + // These flags reflects data size in bytes + LFT_BYTE = 0x01, // Set when data is 1 byte + LFT_16BIT = 0x02, // Set when data is 2 bytes + LFT_32BIT = 0x04, // Set when data is 4 bytes + LFT_64BIT = 0x08, // Set when data is 8 bytes + LFT_REVERSIBLE = 0x10, // Set when data is used for reversible coding + // Not all combinations are useful + LFT_SIZE_MASK = 0x0F, // To extract data size + }; + + public: + line_buf() : size(0), pre_size(0), flags(LFT_UNDEFINED), i32(0) {} template void pre_alloc(mem_fixed_allocator *p, size_t num_ele, ui32 pre_size) @@ -153,9 +167,12 @@ namespace ojph { size_t size; ui32 pre_size; + ui32 flags; union { - si32* i32; - float* f32; + si32* i32; // 32bit integer type, used for lossless compression + si64* i64; // 64bit integer type, used for lossless compression + float* f32; // float type, used for lossy compression + void* p; // no type is associated with the pointer }; }; diff --git a/src/core/common/ojph_params.h b/src/core/common/ojph_params.h index 0dce0ce..602fd99 100644 --- a/src/core/common/ojph_params.h +++ b/src/core/common/ojph_params.h @@ -52,6 +52,7 @@ namespace ojph { struct param_qcd; struct param_qcc; struct param_cap; + struct param_nlt; class codestream; } @@ -131,6 +132,47 @@ namespace ojph { local::param_qcd* state; }; + /** + * @brief non-linearity point transformation object + * (implements NLT marker segment) + * + */ + class OJPH_EXPORT param_nlt + { + public: + enum special_comp_num : ui16 { ALL_COMPS = 65535 }; + public: + param_nlt(local::param_nlt* p) : state(p) {} + + /** + * @brief enables or disables type 3 nonlinearity for a component + * or the default setting + * + * If you think that you need type 3 nonlinearity for all components, + * call this function with comp_num set to 65535 and enable to true. + * + * @param comp_num: component number, or 65535 for the default setting + * @param enable: true to enable nlt type 3 for this component or the + default setting, false to disable nlt type 3. 
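// --------------------------------------------------------------------------
// Sketch of how the new line_buf flags above are meant to be read (the
// LFT_SIZE_MASK bits are mutually exclusive; LFT_REVERSIBLE rides on top);
// the helper name is illustrative:
static bool is_reversible_64bit_sketch(const line_buf& line)
{
  return (line.flags & line_buf::LFT_SIZE_MASK) == line_buf::LFT_64BIT
      && (line.flags & line_buf::LFT_REVERSIBLE) != 0;
}
// a reversible 64-bit line is then read through line.i64, a lossy 32-bit
// line through line.f32, matching the union members documented above.
// --------------------------------------------------------------------------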
+ */ + void set_type3_transformation(ui32 comp_num, bool enable); + + /** + * @brief get the state (enabled or disabled) of type 3 nonlinearity + * for a component or the default setting + * + * @param comp_num: component number, or 65535 for the default setting + * @param bit_depth: returns the bit depth of the component/default + * @param is_signed: returns true if the component/default is signed + * @return true if enabled or false if not. + */ + bool get_type3_transformation(ui32 comp_num, ui8& bit_depth, + bool& is_signed); + + private: + local::param_nlt* state; + }; + //////////////////////////////////////////////////////////////////////////// class OJPH_EXPORT comment_exchange { diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h index 593d4b7..00faf75 100644 --- a/src/core/common/ojph_version.h +++ b/src/core/common/ojph_version.h @@ -34,5 +34,5 @@ //***************************************************************************/ #define OPENJPH_VERSION_MAJOR 0 -#define OPENJPH_VERSION_MINOR 15 +#define OPENJPH_VERSION_MINOR 18 #define OPENJPH_VERSION_PATCH 0 diff --git a/src/core/others/ojph_mem.cpp b/src/core/others/ojph_mem.cpp index b70d51e..0bb0b5f 100644 --- a/src/core/others/ojph_mem.cpp +++ b/src/core/others/ojph_mem.cpp @@ -65,22 +65,42 @@ namespace ojph { f32 = p->post_alloc_data(size, pre_size); } + //////////////////////////////////////////////////////////////////////////// + template<> + void line_buf::finalize_alloc(mem_fixed_allocator *p) + { + assert(p != 0 && size != 0); + i64 = p->post_alloc_data(size, pre_size); + } + //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(si32 *buffer, size_t num_ele, ui32 pre_size) { - i32 = buffer; + this->i32 = buffer; this->size = num_ele; this->pre_size = pre_size; + this->flags = LFT_32BIT | LFT_REVERSIBLE; } //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(float *buffer, size_t num_ele, ui32 pre_size) { - f32 = buffer; + this->f32 = buffer; + this->size = num_ele; + this->pre_size = pre_size; + this->flags = LFT_32BIT; + } + + //////////////////////////////////////////////////////////////////////////// + template<> + void line_buf::wrap(si64 *buffer, size_t num_ele, ui32 pre_size) + { + this->i64 = buffer; this->size = num_ele; this->pre_size = pre_size; + this->flags = LFT_64BIT | LFT_REVERSIBLE; } //////////////////////////////////////////////////////////////////////////// diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index fb42a7d..a98b477 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -39,53 +39,66 @@ #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include "ojph_colour_local.h" namespace ojph { + + // defined elsewhere + class line_buf; + namespace local { ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_si32_to_si32_shftd) - (const si32 *sp, si32 *dp, int shift, ui32 width) = NULL; + void (*rev_convert) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) = NULL; + + ////////////////////////////////////////////////////////////////////////// + void (*rev_convert_nlt_type3) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) = NULL; - 
//////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float_shftd) (const si32 *sp, float *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float) (const si32 *sp, float *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32_shftd) (const float *sp, si32 *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32) (const float *sp, si32 *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*rct_forward) - (const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) = NULL; + (const line_buf* r, const line_buf* g, const line_buf* b, + line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*rct_backward) - (const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) = NULL; + (const line_buf* r, const line_buf* g, const line_buf* b, + line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*ict_forward) (const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*ict_backward) (const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// static bool colour_transform_functions_initialized = false; ////////////////////////////////////////////////////////////////////////// @@ -96,7 +109,8 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) - cnvrt_si32_to_si32_shftd = gen_cnvrt_si32_to_si32_shftd; + rev_convert = gen_rev_convert; + rev_convert_nlt_type3 = gen_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = gen_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = gen_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = gen_cnvrt_float_to_si32_shftd; @@ -125,9 +139,10 @@ namespace ojph { #ifndef OJPH_DISABLE_SSE2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) { + rev_convert = sse2_rev_convert; + rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3; cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd; cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32; - cnvrt_si32_to_si32_shftd = sse2_cnvrt_si32_to_si32_shftd; rct_forward = sse2_rct_forward; rct_backward = sse2_rct_backward; } @@ -148,7 +163,8 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX2 if 
(get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - cnvrt_si32_to_si32_shftd = avx2_cnvrt_si32_to_si32_shftd; + rev_convert = avx2_rev_convert; + rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3; rct_forward = avx2_rct_forward; rct_backward = avx2_rct_backward; } @@ -161,7 +177,9 @@ namespace ojph { #endif // !OJPH_DISABLE_SIMD #else // OJPH_ENABLE_WASM_SIMD - cnvrt_si32_to_si32_shftd = wasm_cnvrt_si32_to_si32_shftd; + + rev_convert = wasm_rev_convert; + rev_convert_nlt_type3 = wasm_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = wasm_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = wasm_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd; @@ -170,6 +188,7 @@ namespace ojph { rct_backward = wasm_rct_backward; ict_forward = wasm_ict_forward; ict_backward = wasm_ict_backward; + #endif // !OJPH_ENABLE_WASM_SIMD colour_transform_functions_initialized = true; @@ -193,11 +212,79 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void gen_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) { - for (ui32 i = width; i > 0; --i) - *dp++ = *sp++ + shift; + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + si32 s = (si32)shift; + for (ui32 i = width; i > 0; --i) + *dp++ = *sp++ + s; + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + for (ui32 i = width; i > 0; --i) + *dp++ = *sp++ + shift; + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + for (ui32 i = width; i > 0; --i) + *dp++ = (si32)(*sp++ + shift); + } + } + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + si32 s = (si32)shift; + for (ui32 i = width; i > 0; --i) { + const si32 v = *sp++; + *dp++ = v >= 0 ? v : (- v - s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + for (ui32 i = width; i > 0; --i) { + const si64 v = *sp++; + *dp++ = v >= 0 ? v : (- v - shift); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + for (ui32 i = width; i > 0; --i) { + const si64 v = *sp++; + *dp++ = (si32)(v >= 0 ? 
v : (- v - shift)); + } + } } ////////////////////////////////////////////////////////////////////////// @@ -233,26 +320,104 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void gen_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { - for (ui32 i = repeat; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - *y++ = (*r + (*g << 1) + *b) >> 2; - *cb++ = (*b++ - *g); - *cr++ = (*r++ - *g++); + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (ui32 i = repeat; i > 0; --i) + { + si32 rr = *rp++, gg = *gp++, bb = *bp++; + *yp++ = (rr + (gg << 1) + bb) >> 2; + *cbp++ = (bb - gg); + *crp++ = (rr - gg); + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (ui32 i = repeat; i > 0; --i) + { + si64 rr = *rp++, gg = *gp++, bb = *bp++; + *yp++ = (rr + (gg << 1) + bb) >> 2; + *cbp++ = (bb - gg); + *crp++ = (rr - gg); + } } } ////////////////////////////////////////////////////////////////////////// - void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void gen_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { - for (ui32 i = repeat; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - *g = *y++ - ((*cb + *cr)>>2); - *b++ = *cb++ + *g; - *r++ = *cr++ + *g++; + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (ui32 i = repeat; i > 0; --i) + { + si32 yy = *yp++, cbb = *cbp++, crr = *crp++; + si32 gg = yy - ((cbb + crr) >> 2); + *rp++ = crr + gg; + *gp++ = gg; + *bp++ = cbb + gg; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = 
r->i32, *gp = g->i32, *bp = b->i32; + for (ui32 i = repeat; i > 0; --i) + { + si64 yy = *yp++, cbb = *cbp++, crr = *crp++; + si64 gg = yy - ((cbb + crr) >> 2); + *rp++ = (si32)(crr + gg); + *gp++ = (si32)gg; + *bp++ = (si32)(cbb + gg); + } } } diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h index 212848b..cc42aaa 100644 --- a/src/core/transform/ojph_colour.h +++ b/src/core/transform/ojph_colour.h @@ -40,14 +40,26 @@ #define OJPH_COLOR_H namespace ojph { + + // defined elsewhere + class line_buf; + namespace local { //////////////////////////////////////////////////////////////////////////// void init_colour_transform_functions(); //////////////////////////////////////////////////////////////////////////// - extern void (*cnvrt_si32_to_si32_shftd) - (const si32 *sp, si32 *dp, int shift, ui32 width); + extern void (*rev_convert) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); + + //////////////////////////////////////////////////////////////////////////// + extern void (*rev_convert_nlt_type3) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_si32_to_float_shftd) @@ -67,13 +79,13 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// extern void (*rct_forward) - (const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + (const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); //////////////////////////////////////////////////////////////////////////// extern void (*rct_backward) - (const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + (const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); //////////////////////////////////////////////////////////////////////////// extern void (*ict_forward) diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 60e20d6..05bff31 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -35,10 +35,12 @@ // Date: 11 October 2019 //***************************************************************************/ +#include #include #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include @@ -46,61 +48,392 @@ namespace ojph { namespace local { + ///////////////////////////////////////////////////////////////////////// + // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h + static inline + __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m) + { + // note than m must be obtained using + // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt)); + __m256i x = _mm256_srli_epi64(a, amt); + x = _mm256_xor_si256(x, m); + __m256i result = _mm256_sub_epi64(x, m); + return result; + } + ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void avx2_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) { - __m256i sh = _mm256_set1_epi32(shift); - for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + 
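avx2_mm256_srai_epi64 fills a real gap: AVX2 has no 64-bit arithmetic right shift, so the helper shifts logically and then uses m = 1 << (63 - amt) to flip and subtract the relocated sign bit. A scalar model of the same identity (stand-alone sketch; srai64_model and the test values are illustrative, not part of the patch):

  #include <cassert>
  #include <cstdint>

  // Logical shift + xor/subtract of m sign-extends exactly like an
  // arithmetic shift: the borrow from (x ^ m) - m fills the top bits.
  static int64_t srai64_model(int64_t a, int amt)
  {
    const uint64_t m = 1ull << (63 - amt); // where the sign bit lands
    uint64_t x = (uint64_t)a >> amt;       // logical shift (srli)
    x ^= m;                                // flip the relocated sign bit
    return (int64_t)(x - m);               // sign-extend via the borrow
  }

  int main()
  {
    const int64_t vals[] = { 100, -100, -1 };
    for (int64_t v : vals)                 // ">> 2" is arithmetic here
      assert(srai64_model(v, 2) == (v >> 2));
    return 0;
  }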
if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi32((si32)shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + s = _mm256_add_epi32(s, sh); + _mm256_storeu_si256((__m256i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s, t; + s = _mm256_loadu_si256((__m256i*)sp); + + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0)); + t = _mm256_add_epi64(t, sh); + _mm256_storeu_si256((__m256i*)dp, t); + + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1)); + t = _mm256_add_epi64(t, sh); + _mm256_storeu_si256((__m256i*)dp + 1, t); + } + } + } + else { - __m256i s = _mm256_loadu_si256((__m256i*)sp); - s = _mm256_add_epi32(s, sh); - _mm256_storeu_si256((__m256i*)dp, s); + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + __m256i sh = _mm256_set1_epi64x(shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s, t; + s = _mm256_loadu_si256((__m256i*)sp); + s = _mm256_add_epi64(s, sh); + + t = _mm256_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm256_and_si256(low_bits, t); + + s = _mm256_loadu_si256((__m256i*)sp + 1); + s = _mm256_add_epi64(s, sh); + + s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); + s = _mm256_andnot_si256(low_bits, s); + + t = _mm256_or_si256(s, t); + t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i*)dp, t); + } } } ////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void avx2_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) { - for (int i = (repeat + 7) >> 3; i > 0; --i) + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi32((si32)(-shift)); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + __m256i c = _mm256_cmpgt_epi32(zero, s); // 0xFFFFFFFF for -ve val + __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only -shift-val + s = _mm256_andnot_si256(c, s); // keep only +ve or 0 + s = _mm256_or_si256(s, v_m_sh); // combine + _mm256_storeu_si256((__m256i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(-shift); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s, t, u0, u1, c, v_m_sh; + s = _mm256_loadu_si256((__m256i*)sp); + + t = _mm256_cmpgt_epi32(zero, s); // find -ve 32bit -1 + u0 = 
_mm256_unpacklo_epi32(s, t); // correct 64bit data + c = _mm256_unpacklo_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm256_sub_epi64(sh, u0); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value + u0 = _mm256_andnot_si256(c, u0); // keep only +ve or 0 + u0 = _mm256_or_si256(u0, v_m_sh); // combine + + u1 = _mm256_unpackhi_epi32(s, t); // correct 64bit data + c = _mm256_unpackhi_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm256_sub_epi64(sh, u1); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value + u1 = _mm256_andnot_si256(c, u1); // keep only +ve or 0 + u1 = _mm256_or_si256(u1, v_m_sh); // combine + + t = _mm256_permute2x128_si256(u0, u1, (2 << 4) | 0); + _mm256_storeu_si256((__m256i*)dp, t); + + t = _mm256_permute2x128_si256(u0, u1, (3 << 4) | 1); + _mm256_storeu_si256((__m256i*)dp + 1, t); + } + } + } + else { - __m256i mr = _mm256_load_si256((__m256i*)r); - __m256i mg = _mm256_load_si256((__m256i*)g); - __m256i mb = _mm256_load_si256((__m256i*)b); - __m256i t = _mm256_add_epi32(mr, mb); - t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); - _mm256_store_si256((__m256i*)y, _mm256_srai_epi32(t, 2)); - t = _mm256_sub_epi32(mb, mg); - _mm256_store_si256((__m256i*)cb, t); - t = _mm256_sub_epi32(mr, mg); - _mm256_store_si256((__m256i*)cr, t); - - r += 8; g += 8; b += 8; - y += 8; cb += 8; cr += 8; + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(-shift); + __m256i zero = _mm256_setzero_si256(); + __m256i half_mask = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + __m256i s, t, p, n, m, tm; + s = _mm256_loadu_si256((__m256i*)sp); + + m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value + tm = _mm256_sub_epi64(sh, s); // - shift - value + n = _mm256_and_si256(m, tm); // -ve + p = _mm256_andnot_si256(m, s); // +ve + tm = _mm256_or_si256(n, p); + tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm256_and_si256(half_mask, tm); + + s = _mm256_loadu_si256((__m256i*)sp + 1); + m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value + tm = _mm256_sub_epi64(sh, s); // - shift - value + n = _mm256_and_si256(m, tm); // -ve + p = _mm256_andnot_si256(m, s); // +ve + tm = _mm256_or_si256(n, p); + tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0)); + tm = _mm256_andnot_si256(half_mask, tm); + + t = _mm256_or_si256(t, tm); + t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i*)dp, t); + } } } ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void avx2_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) + { + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) + { + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) 
&& + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i mr = _mm256_load_si256((__m256i*)rp); + __m256i mg = _mm256_load_si256((__m256i*)gp); + __m256i mb = _mm256_load_si256((__m256i*)bp); + __m256i t = _mm256_add_epi32(mr, mb); + t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); + _mm256_store_si256((__m256i*)yp, _mm256_srai_epi32(t, 2)); + t = _mm256_sub_epi32(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi32(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + rp += 8; gp += 8; bp += 8; + yp += 8; cbp += 8; crp += 8; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i mr32 = _mm256_load_si256((__m256i*)rp); + __m256i mg32 = _mm256_load_si256((__m256i*)gp); + __m256i mb32 = _mm256_load_si256((__m256i*)bp); + __m256i mr, mg, mb, t; + mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0)); + mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0)); + mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0)); + + t = _mm256_add_epi64(mr, mb); + t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); + _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); + t = _mm256_sub_epi64(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi64(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + yp += 4; cbp += 4; crp += 4; + + mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1)); + mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1)); + mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1)); + + t = _mm256_add_epi64(mr, mb); + t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); + _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); + t = _mm256_sub_epi64(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi64(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + rp += 8; gp += 8; bp += 8; + yp += 4; cbp += 4; crp += 4; + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void avx2_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 7) >> 3; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) + { + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i my = 
_mm256_load_si256((__m256i*)yp); + __m256i mcb = _mm256_load_si256((__m256i*)cbp); + __m256i mcr = _mm256_load_si256((__m256i*)crp); + + __m256i t = _mm256_add_epi32(mcb, mcr); + t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); + _mm256_store_si256((__m256i*)gp, t); + __m256i u = _mm256_add_epi32(mcb, t); + _mm256_store_si256((__m256i*)bp, u); + u = _mm256_add_epi32(mcr, t); + _mm256_store_si256((__m256i*)rp, u); + + yp += 8; cbp += 8; crp += 8; + rp += 8; gp += 8; bp += 8; + } + } + else { - __m256i my = _mm256_load_si256((__m256i*)y); - __m256i mcb = _mm256_load_si256((__m256i*)cb); - __m256i mcr = _mm256_load_si256((__m256i*)cr); - - __m256i t = _mm256_add_epi32(mcb, mcr); - t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); - _mm256_store_si256((__m256i*)g, t); - __m256i u = _mm256_add_epi32(mcb, t); - _mm256_store_si256((__m256i*)b, u); - u = _mm256_add_epi32(mcr, t); - _mm256_store_si256((__m256i*)r, u); - - y += 8; cb += 8; cr += 8; - r += 8; g += 8; b += 8; + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i my, mcb, mcr, tr, tg, tb; + my = _mm256_load_si256((__m256i*)yp); + mcb = _mm256_load_si256((__m256i*)cbp); + mcr = _mm256_load_si256((__m256i*)crp); + + tg = _mm256_add_epi64(mcb, mcr); + tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2)); + tb = _mm256_add_epi64(mcb, tg); + tr = _mm256_add_epi64(mcr, tg); + + __m256i mr, mg, mb; + mr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0)); + mr = _mm256_and_si256(low_bits, mr); + mg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0)); + mg = _mm256_and_si256(low_bits, mg); + mb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0)); + mb = _mm256_and_si256(low_bits, mb); + + yp += 4; cbp += 4; crp += 4; + + my = _mm256_load_si256((__m256i*)yp); + mcb = _mm256_load_si256((__m256i*)cbp); + mcr = _mm256_load_si256((__m256i*)crp); + + tg = _mm256_add_epi64(mcb, mcr); + tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2)); + tb = _mm256_add_epi64(mcb, tg); + tr = _mm256_add_epi64(mcr, tg); + + tr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); + tr = _mm256_andnot_si256(low_bits, tr); + mr = _mm256_or_si256(mr, tr); + mr = _mm256_permute4x64_epi64(mr, _MM_SHUFFLE(3, 1, 2, 0)); + + tg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); + tg = _mm256_andnot_si256(low_bits, tg); + mg = _mm256_or_si256(mg, tg); + mg = _mm256_permute4x64_epi64(mg, _MM_SHUFFLE(3, 1, 2, 0)); + + tb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); + tb = _mm256_andnot_si256(low_bits, tb); + mb = _mm256_or_si256(mb, tb); + mb = _mm256_permute4x64_epi64(mb, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)rp, mr); + _mm256_store_si256((__m256i*)gp, mg); + _mm256_store_si256((__m256i*)bp, mb); + + yp += 4; cbp += 4; crp += 4; + rp += 8; gp += 8; bp += 8; + } } } diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index 6ddf890..5eb8b74 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -65,8 +65,16 @@ namespace ojph { 
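All of the rct_forward/rct_backward paths above implement one lifting identity, so a scalar check is enough to see why the transform round-trips exactly, and why only the operand width (si32 versus si64) changes between branches. (Stand-alone sketch; the sample values are arbitrary.)

  #include <cassert>
  #include <cstdint>

  int main()
  {
    int64_t r = 4000, g = -2500, b = 1000;   // arbitrary samples

    int64_t y  = (r + 2 * g + b) >> 2;       // forward RCT
    int64_t cb = b - g;
    int64_t cr = r - g;

    // Inverse: y = g + ((cb + cr) >> 2) holds exactly because
    // cb + cr = r + b - 2g and the 4g term shifts out with no loss.
    int64_t g2 = y - ((cb + cr) >> 2);
    int64_t r2 = cr + g2;
    int64_t b2 = cb + g2;

    assert(r2 == r && g2 == g && b2 == b);   // exact round trip
    return 0;
  }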
////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void gen_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, @@ -85,12 +93,14 @@ namespace ojph { ui32 width); ////////////////////////////////////////////////////////////////////////// - void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void gen_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void gen_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// void gen_ict_forward(const float *r, const float *g, const float *b, @@ -157,16 +167,26 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void sse2_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void sse2_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void sse2_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); + + ////////////////////////////////////////////////////////////////////////// + void sse2_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// // @@ -209,16 +229,26 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void avx2_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); 
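These declarations keep one signature per operation across the gen_/sse2_/avx2_/wasm_ families so that init_colour_transform_functions() can rebind a single function pointer to the widest ISA the CPU reports. A condensed, stand-alone model of that dispatch pattern (the names and level constants here are illustrative, not the library's actual values):

  #include <cstdint>
  #include <cstdio>

  typedef void (*convert_fn)(const int32_t* sp, int32_t* dp,
                             int64_t shift, uint32_t width);

  static void generic_impl(const int32_t* sp, int32_t* dp,
                           int64_t shift, uint32_t width)
  {
    for (uint32_t i = 0; i < width; ++i)  // portable scalar fallback
      dp[i] = (int32_t)(sp[i] + shift);
  }

  static convert_fn convert = nullptr;

  static void init_functions(int cpu_ext_level)
  {
    convert = generic_impl;     // always-correct default comes first
    // Stronger ISAs then overwrite the pointer, mirroring the
    // sse2_/avx2_ blocks in ojph_colour.cpp:
    //   if (cpu_ext_level >= LEVEL_SSE2) convert = sse2_impl;
    //   if (cpu_ext_level >= LEVEL_AVX2) convert = avx2_impl;
    (void)cpu_ext_level;
  }

  int main()
  {
    init_functions(0);
    int32_t s[4] = { 1, 2, 3, 4 }, d[4];
    convert(s, d, 128, 4);      // every call goes through the pointer
    std::printf("%d\n", d[0]);  // prints 129
    return 0;
  }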
////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void avx2_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void avx2_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); + + ////////////////////////////////////////////////////////////////////////// + void avx2_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// // @@ -245,16 +275,26 @@ namespace ojph { ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void wasm_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void wasm_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void wasm_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// void wasm_ict_forward(const float *r, const float *g, const float *b, diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 4a3cb14..a529c66 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -35,10 +35,12 @@ // Date: 11 October 2019 //***************************************************************************/ +#include #include #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include @@ -46,6 +48,207 @@ namespace ojph { namespace local { + ///////////////////////////////////////////////////////////////////////// + // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h + static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) + { + // note than m must be obtained using + // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt)); + __m128i x = _mm_srli_epi64(a, amt); + x = _mm_xor_si128(x, m); + __m128i result = _mm_sub_epi64(x, m); + return result; + } + + ////////////////////////////////////////////////////////////////////////// + static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero) + { + __m128i t; + t = 
_mm_cmplt_epi32(a, zero); // get -ve + t = _mm_unpacklo_epi32(a, t); + return t; + } + + ////////////////////////////////////////////////////////////////////////// + static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero) + { + __m128i t; + t = _mm_cmplt_epi32(a, zero); // get -ve + t = _mm_unpackhi_epi32(a, t); + return t; + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i sh = _mm_set1_epi32((si32)shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128i s = _mm_loadu_si128((__m128i*)sp); + s = _mm_add_epi32(s, sh); + _mm_storeu_si128((__m128i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m128i zero = _mm_setzero_si128(); + __m128i sh = _mm_set1_epi64x(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128i s, t; + s = _mm_loadu_si128((__m128i*)sp); + + t = sse2_cvtlo_epi32_epi64(s, zero); + t = _mm_add_epi64(t, sh); + _mm_storeu_si128((__m128i*)dp, t); + + t = sse2_cvthi_epi32_epi64(s, zero); + t = _mm_add_epi64(t, sh); + _mm_storeu_si128((__m128i*)dp + 1, t); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX); + __m128i sh = _mm_set1_epi64x(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128i s, t; + s = _mm_loadu_si128((__m128i*)sp); + s = _mm_add_epi64(s, sh); + + t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm_and_si128(low_bits, t); + + s = _mm_loadu_si128((__m128i*)sp + 1); + s = _mm_add_epi64(s, sh); + + s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); + s = _mm_andnot_si128(low_bits, s); + + t = _mm_or_si128(s, t); + _mm_storeu_si128((__m128i*)dp, t); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i sh = _mm_set1_epi32((si32)(-shift)); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i s = _mm_loadu_si128((__m128i*)sp); + __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value + __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + s = _mm_andnot_si128(c, s); // keep only +ve or 0 + s = _mm_or_si128(s, v_m_sh); // combine + _mm_storeu_si128((__m128i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m128i sh = _mm_set1_epi64x(-shift); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i 
s, t, u, c, v_m_sh; + s = _mm_loadu_si128((__m128i*)sp); + + t = _mm_cmplt_epi32(s, zero); // find -ve 32bit -1 + u = _mm_unpacklo_epi32(s, t); // correct 64bit data + c = _mm_unpacklo_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + u = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(u, v_m_sh); // combine + + _mm_storeu_si128((__m128i*)dp, u); + u = _mm_unpackhi_epi32(s, t); // correct 64bit data + c = _mm_unpackhi_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + u = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(u, v_m_sh); // combine + + _mm_storeu_si128((__m128i*)dp + 1, u); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i sh = _mm_set1_epi64x(-shift); + __m128i zero = _mm_setzero_si128(); + __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + __m128i s, t, p, n, m, tm; + s = _mm_loadu_si128((__m128i*)sp); + + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value + m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b + tm = _mm_sub_epi64(sh, s); // - shift - value + n = _mm_and_si128(m, tm); // -ve + p = _mm_andnot_si128(m, s); // +ve + tm = _mm_or_si128(n, p); + tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm_and_si128(half_mask, tm); + + s = _mm_loadu_si128((__m128i*)sp + 1); + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value + m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b + tm = _mm_sub_epi64(sh, s); // - shift - value + n = _mm_and_si128(m, tm); // -ve + p = _mm_andnot_si128(m, s); // +ve + tm = _mm_or_si128(n, p); + tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0)); + tm = _mm_andnot_si128(half_mask, tm); + + t = _mm_or_si128(t, tm); + _mm_storeu_si128((__m128i*)dp, t); + } + } + } + ////////////////////////////////////////////////////////////////////////// void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width) @@ -80,64 +283,200 @@ namespace ojph { _MM_SET_ROUNDING_MODE(rounding_mode); } - ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void sse2_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) { - __m128i sh = _mm_set1_epi32(shift); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m128i s = _mm_loadu_si128((__m128i*)sp); - s = _mm_add_epi32(s, sh); - _mm_storeu_si128((__m128i*)dp, s); - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & 
line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i mr = _mm_load_si128((__m128i*)rp); + __m128i mg = _mm_load_si128((__m128i*)gp); + __m128i mb = _mm_load_si128((__m128i*)bp); + __m128i t = _mm_add_epi32(mr, mb); + t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); + _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2)); + t = _mm_sub_epi32(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi32(mr, mg); + _mm_store_si128((__m128i*)crp, t); - ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) - { - for (int i = (repeat + 3) >> 2; i > 0; --i) + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else { - __m128i mr = _mm_load_si128((__m128i*)r); - __m128i mg = _mm_load_si128((__m128i*)g); - __m128i mb = _mm_load_si128((__m128i*)b); - __m128i t = _mm_add_epi32(mr, mb); - t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); - _mm_store_si128((__m128i*)y, _mm_srai_epi32(t, 2)); - t = _mm_sub_epi32(mb, mg); - _mm_store_si128((__m128i*)cb, t); - t = _mm_sub_epi32(mr, mg); - _mm_store_si128((__m128i*)cr, t); - - r += 4; g += 4; b += 4; - y += 4; cb += 4; cr += 4; + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m128i zero = _mm_setzero_si128(); + __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i mr32 = _mm_load_si128((__m128i*)rp); + __m128i mg32 = _mm_load_si128((__m128i*)gp); + __m128i mb32 = _mm_load_si128((__m128i*)bp); + __m128i mr, mg, mb, t; + mr = sse2_cvtlo_epi32_epi64(mr32, zero); + mg = sse2_cvtlo_epi32_epi64(mg32, zero); + mb = sse2_cvtlo_epi32_epi64(mb32, zero); + + t = _mm_add_epi64(mr, mb); + t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); + _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); + t = _mm_sub_epi64(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi64(mr, mg); + _mm_store_si128((__m128i*)crp, t); + + yp += 2; cbp += 2; crp += 2; + + mr = sse2_cvthi_epi32_epi64(mr32, zero); + mg = sse2_cvthi_epi32_epi64(mg32, zero); + mb = sse2_cvthi_epi32_epi64(mb32, zero); + + t = _mm_add_epi64(mr, mb); + t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); + _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); + t = _mm_sub_epi64(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi64(mr, mg); + _mm_store_si128((__m128i*)crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } } } ////////////////////////////////////////////////////////////////////////// - void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void sse2_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 3) >> 2; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & 
line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m128i my = _mm_load_si128((__m128i*)y); - __m128i mcb = _mm_load_si128((__m128i*)cb); - __m128i mcr = _mm_load_si128((__m128i*)cr); - - __m128i t = _mm_add_epi32(mcb, mcr); - t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); - _mm_store_si128((__m128i*)g, t); - __m128i u = _mm_add_epi32(mcb, t); - _mm_store_si128((__m128i*)b, u); - u = _mm_add_epi32(mcr, t); - _mm_store_si128((__m128i*)r, u); - - y += 4; cb += 4; cr += 4; - r += 4; g += 4; b += 4; + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i my = _mm_load_si128((__m128i*)yp); + __m128i mcb = _mm_load_si128((__m128i*)cbp); + __m128i mcr = _mm_load_si128((__m128i*)crp); + + __m128i t = _mm_add_epi32(mcb, mcr); + t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); + _mm_store_si128((__m128i*)gp, t); + __m128i u = _mm_add_epi32(mcb, t); + _mm_store_si128((__m128i*)bp, u); + u = _mm_add_epi32(mcr, t); + _mm_store_si128((__m128i*)rp, u); + + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } } - } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); + __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i my, mcb, mcr, tr, tg, tb; + my = _mm_load_si128((__m128i*)yp); + mcb = _mm_load_si128((__m128i*)cbp); + mcr = _mm_load_si128((__m128i*)crp); + + tg = _mm_add_epi64(mcb, mcr); + tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2)); + tb = _mm_add_epi64(mcb, tg); + tr = _mm_add_epi64(mcr, tg); + + __m128i mr, mg, mb; + mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0)); + mr = _mm_and_si128(low_bits, mr); + mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0)); + mg = _mm_and_si128(low_bits, mg); + mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0)); + mb = _mm_and_si128(low_bits, mb); + yp += 2; cbp += 2; crp += 2; + + my = _mm_load_si128((__m128i*)yp); + mcb = _mm_load_si128((__m128i*)cbp); + mcr = _mm_load_si128((__m128i*)crp); + + tg = _mm_add_epi64(mcb, mcr); + tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2)); + tb = _mm_add_epi64(mcb, tg); + tr = _mm_add_epi64(mcr, tg); + + tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); + tr = _mm_andnot_si128(low_bits, tr); + mr = _mm_or_si128(mr, tr); + tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); + tg = _mm_andnot_si128(low_bits, tg); + mg = _mm_or_si128(mg, tg); + tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); + tb = _mm_andnot_si128(low_bits, tb); + mb = _mm_or_si128(mb, tb); + + _mm_store_si128((__m128i*)rp, mr); + _mm_store_si128((__m128i*)gp, mg); + _mm_store_si128((__m128i*)bp, mb); + + yp += 2; cbp += 2; crp += 2; + rp += 4; gp += 4; bp += 4; + } + } + } } } diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 632a645..5bf6ccd 100644 --- 
a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -39,12 +39,164 @@ #include #include "ojph_defs.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include "ojph_colour_local.h" namespace ojph { namespace local { + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i32x4_splat((si32)shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s = wasm_v128_load(sp); + s = wasm_i32x4_add(s, sh); + wasm_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s, t; + s = wasm_v128_load(sp); + + t = wasm_i64x2_extend_low_i32x4(s); + t = wasm_i64x2_add(t, sh); + wasm_v128_store(dp, t); + + t = wasm_i64x2_extend_high_i32x4(s); + t = wasm_i64x2_add(t, sh); + wasm_v128_store(dp + 2, t); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s0, s1; + s0 = wasm_v128_load(sp); + s0 = wasm_i64x2_add(s0, sh); + s1 = wasm_v128_load(sp + 2); + s1 = wasm_i64x2_add(s1, sh); + s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2); + wasm_v128_store(dp, s0); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i32x4_splat((si32)(-shift)); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s = wasm_v128_load(sp); + v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value + v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + s = wasm_v128_andnot(c, s); // keep only +ve or 0 + s = wasm_v128_or(s, v_m_sh); // combine + wasm_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s, u, c, v_m_sh; + s = wasm_v128_load(sp); + + u = wasm_i64x2_extend_low_i32x4(s); + c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(u, v_m_sh); // combine + + wasm_v128_store(dp, u); + + u = wasm_i64x2_extend_high_i32x4(s); + c = wasm_i64x2_lt(u, zero); 
// 64b -1 for -ve value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(u, v_m_sh); // combine + + wasm_v128_store(dp + 2, u); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + v128_t s, t0, t1, p, n, m, tm; + s = wasm_v128_load(sp); + m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = wasm_i64x2_sub(sh, s); // - shift - value + n = wasm_v128_and(m, tm); // -ve + p = wasm_v128_andnot(m, s); // +ve + t0 = wasm_v128_or(n, p); + + s = wasm_v128_load(sp + 2); + m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = wasm_i64x2_sub(sh, s); // - shift - value + n = wasm_v128_and(m, tm); // -ve + p = wasm_v128_andnot(m, s); // +ve + t1 = wasm_v128_or(n, p); + + t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2); + wasm_v128_store(dp, t0); + } + } + } + ////////////////////////////////////////////////////////////////////////// void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width) @@ -108,62 +260,182 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void wasm_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) { - v128_t sh = wasm_i32x4_splat(shift); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - v128_t s = wasm_v128_load(sp); - s = wasm_i32x4_add(s, sh); - wasm_v128_store(dp, s); - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; - ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) - { - for (int i = (repeat + 3) >> 2; i > 0; --i) + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr = wasm_v128_load(rp); + v128_t mg = wasm_v128_load(gp); + v128_t mb = wasm_v128_load(bp); + v128_t t = wasm_i32x4_add(mr, mb); + t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1)); + wasm_v128_store(yp, wasm_i32x4_shr(t, 2)); + t = wasm_i32x4_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i32x4_sub(mr, mg); + wasm_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else { - v128_t mr = wasm_v128_load(r); - v128_t mg = wasm_v128_load(g); - v128_t mb = wasm_v128_load(b); - v128_t t = wasm_i32x4_add(mr, mb); - t = 
wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1)); - wasm_v128_store(y, wasm_i32x4_shr(t, 2)); - t = wasm_i32x4_sub(mb, mg); - wasm_v128_store(cb, t); - t = wasm_i32x4_sub(mr, mg); - wasm_v128_store(cr, t); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr32 = wasm_v128_load(rp); + v128_t mg32 = wasm_v128_load(gp); + v128_t mb32 = wasm_v128_load(bp); + v128_t mr, mg, mb, t; + mr = wasm_i64x2_extend_low_i32x4(mr32); + mg = wasm_i64x2_extend_low_i32x4(mg32); + mb = wasm_i64x2_extend_low_i32x4(mb32); + + t = wasm_i64x2_add(mr, mb); + t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); + wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); + t = wasm_i64x2_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i64x2_sub(mr, mg); + wasm_v128_store(crp, t); - r += 4; g += 4; b += 4; - y += 4; cb += 4; cr += 4; + yp += 2; cbp += 2; crp += 2; + + mr = wasm_i64x2_extend_high_i32x4(mr32); + mg = wasm_i64x2_extend_high_i32x4(mg32); + mb = wasm_i64x2_extend_high_i32x4(mb32); + + t = wasm_i64x2_add(mr, mb); + t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); + wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); + t = wasm_i64x2_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i64x2_sub(mr, mg); + wasm_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } } } ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void wasm_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 3) >> 2; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - v128_t my = wasm_v128_load(y); - v128_t mcb = wasm_v128_load(cb); - v128_t mcr = wasm_v128_load(cr); + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my = wasm_v128_load(yp); + v128_t mcb = wasm_v128_load(cbp); + v128_t mcr = wasm_v128_load(crp); - v128_t t = wasm_i32x4_add(mcb, mcr); - t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2)); - wasm_v128_store(g, t); - v128_t u = wasm_i32x4_add(mcb, t); - wasm_v128_store(b, u); - u = wasm_i32x4_add(mcr, t); - wasm_v128_store(r, u); + v128_t t = wasm_i32x4_add(mcb, mcr); + t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2)); + wasm_v128_store(gp, t); + v128_t u = wasm_i32x4_add(mcb, t); + wasm_v128_store(bp, u); + u = wasm_i32x4_add(mcr, t); + wasm_v128_store(rp, u); - y += 4; cb += 4; cr += 4; - r += 4; g += 4; b += 4; + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + 
(cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1; + my = wasm_v128_load(yp); + mcb = wasm_v128_load(cbp); + mcr = wasm_v128_load(crp); + + tg0 = wasm_i64x2_add(mcb, mcr); + tg0 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg0, 2)); + tb0 = wasm_i64x2_add(mcb, tg0); + tr0 = wasm_i64x2_add(mcr, tg0); + + yp += 2; cbp += 2; crp += 2; + + my = wasm_v128_load(yp); + mcb = wasm_v128_load(cbp); + mcr = wasm_v128_load(crp); + + tg1 = wasm_i64x2_add(mcb, mcr); + tg1 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg1, 2)); + tb1 = wasm_i64x2_add(mcb, tg1); + tr1 = wasm_i64x2_add(mcr, tg1); + + tr0 = wasm_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2); + tg0 = wasm_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2); + tb0 = wasm_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2); + + wasm_v128_store(rp, tr0); + wasm_v128_store(gp, tg0); + wasm_v128_store(bp, tb0); + + yp += 2; cbp += 2; crp += 2; + rp += 4; gp += 4; bp += 4; + } } } diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp index ee4bb08..c4313ab 100644 --- a/src/core/transform/ojph_transform.cpp +++ b/src/core/transform/ojph_transform.cpp @@ -45,7 +45,9 @@ #include "../codestream/ojph_params_local.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; namespace local { @@ -156,9 +158,9 @@ namespace ojph { #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { - rev_vert_step = avx512_rev_vert_step; - rev_horz_ana = avx512_rev_horz_ana; - rev_horz_syn = avx512_rev_horz_syn; + // rev_vert_step = avx512_rev_vert_step; + // rev_horz_ana = avx512_rev_horz_ana; + // rev_horz_syn = avx512_rev_horz_syn; irv_vert_step = avx512_irv_vert_step; irv_vert_times_K = avx512_irv_vert_times_K; @@ -192,13 +194,14 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) ///////////////////////////////////////////////////////////////////////// - void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, - ui32 repeat, bool synthesis) + static + void gen_rev_vert_step32(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) { const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; si32* dst = aug->i32; const si32* src1 = sig->i32, * src2 = other->i32; @@ -243,9 +246,85 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, - ui32 width, bool even) + static + void gen_rev_vert_step64(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + si64* dst = aug->i64; + const si64* src1 = sig->i64, * src2 = other->i64; + // The general definition of the wavelet in Part 2 is slightly + // different to part 2, although they are mathematically equivalent + // here, we identify the simpler form from Part 1 and employ them + 
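The branch ladder that follows (shared with the 32-bit version) exists to strip the multiply and, where possible, the bias term out of the common 5/3 lifting steps. Both shortcuts can be checked against the general step in isolation (stand-alone sketch; general_step and the sample values are illustrative):

  #include <cassert>
  #include <cstdint>

  // General reversible lifting step: x += (b + a*(s1 + s2)) >> e.
  static int64_t general_step(int64_t x, int64_t s1, int64_t s2,
                              int64_t a, int64_t b, int e)
  {
    return x + ((b + a * (s1 + s2)) >> e);
  }

  int main()
  {
    const int64_t x = 37, s1 = -5, s2 = 12;

    // 5/3 update (a == 1, b == 2, e == 2): only the multiply drops out.
    assert(general_step(x, s1, s2, 1, 2, 2) == x + ((2 + s1 + s2) >> 2));

    // 5/3 predict (a == -1, b == 1, e == 1): with floor shifts,
    // (1 - t) >> 1 == -(t >> 1), so the bias vanishes as well.
    assert(general_step(x, s1, s2, -1, 1, 1) == x - ((s1 + s2) >> 1));

    return 0;
  }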
if (a == 1) + { // 5/3 update and any case with a == 1 + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + *src1++ + *src2++) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + *src1++ + *src2++) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ += (*src1++ + *src2++) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (*src1++ + *src2++) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b - (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b - (*src1++ + *src2++)) >> e; + } + else { // general case + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + gen_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + gen_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { @@ -277,7 +356,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; // extension lp[-1] = lp[0]; @@ -319,11 +398,111 @@ namespace ojph { hdst->i32[0] = src->i32[0] << 1; } } - - ////////////////////////////////////////////////////////////////////////// - void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + + ///////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // combine both lsrc and hsrc into dst + si64* dph = hdst->i64; + si64* dpl = ldst->i64; + si64* sp = src->i64; + ui32 w = width; + if (!even) + { + *dph++ = *sp++; --w; + } + for (; w > 1; w -= 2) + { + *dpl++ = *sp++; *dph++ = *sp++; + } + if (w) + { + *dpl++ = *sp++; --w; + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp + (even ? 1 : 0); + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + (sp[-1] + sp[0])) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp -= (sp[-1] + sp[0]) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b - (sp[-1] + sp[0])) >> e; + } + else { + // general case + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + gen_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + gen_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -337,7 +516,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; // extension oth[-1] = oth[0]; @@ -398,6 +577,105 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth + (ev ? 
0 : 1); + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + (sp[-1] + sp[0])) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp += (sp[-1] + sp[0]) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b - (sp[-1] + sp[0])) >> e; + } + else { + // general case + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + si64* sph = hsrc->i64; + si64* spl = lsrc->i64; + si64* dp = dst->i64; + ui32 w = width; + if (!even) + { + *dp++ = *sph++; --w; + } + for (; w > 1; w -= 2) + { + *dp++ = *spl++; *dp++ = *sph++; + } + if (w) + { + *dp++ = *spl++; --w; + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + gen_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + gen_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + ////////////////////////////////////////////////////////////////////////// void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, const line_buf* other, const line_buf* aug, diff --git a/src/core/transform/ojph_transform.h b/src/core/transform/ojph_transform.h index 0e59632..f7576a1 100644 --- a/src/core/transform/ojph_transform.h +++ b/src/core/transform/ojph_transform.h @@ -42,7 +42,10 @@ #include "ojph_defs.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; + namespace local { union lifting_step; struct param_atk; diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp index 0856662..8838d18 100644 --- a/src/core/transform/ojph_transform_avx.cpp +++ b/src/core/transform/ojph_transform_avx.cpp @@ -61,6 +61,40 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline + void avx_deinterleave32(float* dpl, float* dph, float* sp, int width) + { + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) + { + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void avx_interleave32(float* dp, float* spl, float* sph, int width) + { + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) + { + __m256 a = _mm256_load_ps(spl); + 
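+        // the unpack/permute pair below interleaves the two bands: for
+        // low-pass samples l0..l7 and high-pass samples h0..h7 it emits
+        // l0 h0 l1 h1 ... l7 h7 (unpack works within 128-bit lanes, so a
+        // cross-lane permute is still needed)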
__m256 b = _mm256_load_ps(sph);
+        __m256 c = _mm256_unpacklo_ps(a, b);
+        __m256 d = _mm256_unpackhi_ps(a, b);
+        __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+        __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+        _mm256_store_ps(dp, e);
+        _mm256_store_ps(dp + 8, f);
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void avx_irv_vert_step(const lifting_step* s, const line_buf* sig,
                            const line_buf* other, const line_buf* aug,
@@ -100,11 +134,11 @@
       {
         // split src into ldst and hdst
         {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
           float* sp = src->f32;
           int w = (int)width;
-          AVX_DEINTERLEAVE(dpl, dph, sp, w, even);
+          avx_deinterleave32(dpl, dph, sp, w);
         }
 
         // the actual horizontal transform
@@ -235,10 +269,10 @@
         // combine both lsrc and hsrc into dst
         {
           float* dp = dst->f32;
-          float* spl = lsrc->f32;
-          float* sph = hsrc->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
           int w = (int)width;
-          AVX_INTERLEAVE(dp, spl, sph, w, even);
+          avx_interleave32(dp, spl, sph, w);
         }
       }
       else {
diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp
index 847cd4c..1bc92e6 100644
--- a/src/core/transform/ojph_transform_avx2.cpp
+++ b/src/core/transform/ojph_transform_avx2.cpp
@@ -35,6 +35,7 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include <cassert>
 #include 
 #include "ojph_defs.h"
 
@@ -52,13 +53,95 @@ namespace ojph {
 
   namespace local {
 
     /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline
+    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
+    {
+      // note that m must be obtained using
+      // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
+      __m256i x = _mm256_srli_epi64(a, amt);
+      x = _mm256_xor_si256(x, m);
+      __m256i result = _mm256_sub_epi64(x, m);
+      return result;
+    }
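+
+    // a scalar model of the arithmetic-shift emulation above, using the
+    // library's si64/ui64 types:
+    //   ui64 u = ((ui64)x) >> amt;    // logical shift, zero filled
+    //   si64 m = 1LL << (63 - amt);   // the sign bit, at its new position
+    //   si64 r = (si64)(u ^ m) - m;   // xor/subtract sign-extends the top
+    //                                 // amt bits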
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
+      {
+        __m256 a = _mm256_load_ps(sp);
+        __m256 b = _mm256_load_ps(sp + 8);
+        __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
+        __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
+        __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
+        __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm256_store_ps(dpl, e);
+        _mm256_store_ps(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
+      {
+        __m256 a = _mm256_load_ps(spl);
+        __m256 b = _mm256_load_ps(sph);
+        __m256 c = _mm256_unpacklo_ps(a, b);
+        __m256 d = _mm256_unpackhi_ps(a, b);
+        __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+        __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+        _mm256_store_ps(dp, e);
+        _mm256_store_ps(dp + 8, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m256d a = _mm256_load_pd(sp);
+        __m256d b = _mm256_load_pd(sp + 4);
+        __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
+        __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
+        __m256d e = _mm256_shuffle_pd(c, d, 0x0);
+        __m256d f = _mm256_shuffle_pd(c, d, 0xF);
+        _mm256_store_pd(dpl, e);
+        _mm256_store_pd(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m256d a = _mm256_load_pd(spl);
+        __m256d b = _mm256_load_pd(sph);
+        __m256d c = _mm256_unpacklo_pd(a, b);
+        __m256d d = _mm256_unpackhi_pd(a, b);
+        __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
+        __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
+        _mm256_store_pd(dp, e);
+        _mm256_store_pd(dp + 4, f);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void avx2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const si32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       __m256i va = _mm256_set1_epi32(a);
       __m256i vb = _mm256_set1_epi32(b);
@@ -181,19 +264,154 @@
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    static
+    void avx2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m256i vb = _mm256_set1_epi64x(b);
+      __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1; although the two forms are
+      // mathematically equivalent, here we identify the simpler special
+      // cases from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_add_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_sub_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_add_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_add_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = 
_mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else { // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm256_mullo_epi64. + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + avx2_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + avx2_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - // combine both lsrc and hsrc into dst + // split src into ldst and hdst { - float* dpl = ldst->f32; - float* dph = hdst->f32; - float* sp = src->f32; + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? 
hdst->f32 : ldst->f32; + float* sp = src->f32; int w = (int)width; - AVX_DEINTERLEAVE(dpl, dph, sp, w, even); + avx2_deinterleave32(dpl, dph, sp, w); } si32* hp = hdst->i32, * lp = ldst->i32; @@ -206,7 +424,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -346,11 +564,181 @@ namespace ojph { hdst->i32[0] = src->i32[0] << 1; } } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // split src into ldst and hdst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)src->p; + int w = (int)width; + avx2_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i 
s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm256_mullo_epi64. + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + avx2_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + avx2_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, - ui32 width, bool even) + static + void avx2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -364,7 +752,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -499,11 +887,11 @@ namespace ojph { // combine both lsrc and hsrc into dst { - float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; int w = (int)width; - AVX_INTERLEAVE(dp, spl, sph, w, even); + avx2_interleave32(dp, spl, sph, w); } } else { @@ -514,5 +902,174 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm_mullo_epi64. 
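+            // this branch is only reached for reversible kernels signalled
+            // through Part 2 ATK segments; the 5/3 steps are all caught
+            // above, and the 64-bit mullo intrinsics need AVX512DQ +
+            // AVX512VL, so plain C is used here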
+ if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? hsrc->p : lsrc->p); + int w = (int)width; + avx2_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + avx2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + avx2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_avx512.cpp b/src/core/transform/ojph_transform_avx512.cpp index 504aa87..0e92230 100644 --- a/src/core/transform/ojph_transform_avx512.cpp +++ b/src/core/transform/ojph_transform_avx512.cpp @@ -54,8 +54,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// // We split multiples of 32 followed by multiples of 16, because // we assume byte_alignment == 64 - static void avx512_deinterleave(float* dpl, float* dph, float* sp, - int width, bool even) + static + void avx512_deinterleave32(float* dpl, float* dph, float* sp, int width) { __m512i idx1 = _mm512_set_epi32( 0x1E, 0x1C, 0x1A, 0x18, 0x16, 0x14, 0x12, 0x10, @@ -65,59 +65,33 @@ namespace ojph { 0x1F, 0x1D, 0x1B, 0x19, 0x17, 0x15, 0x13, 0x11, 0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01 ); - if (even) + for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) { - for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) - { - __m512 a = _mm512_load_ps(sp); - __m512 b = _mm512_load_ps(sp + 16); - __m512 c = _mm512_permutex2var_ps(a, idx1, b); - __m512 d = _mm512_permutex2var_ps(a, idx2, b); - _mm512_store_ps(dpl, c); - _mm512_store_ps(dph, d); - } - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, e); - _mm256_store_ps(dph, f); - } + __m512 a = _mm512_load_ps(sp); + __m512 b = _mm512_load_ps(sp + 16); + __m512 c = _mm512_permutex2var_ps(a, idx1, b); + __m512 d = _mm512_permutex2var_ps(a, idx2, b); + _mm512_store_ps(dpl, c); + _mm512_store_ps(dph, d); } - else + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) { - for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) - { - __m512 a = _mm512_load_ps(sp); - __m512 b = _mm512_load_ps(sp + 16); - __m512 c = _mm512_permutex2var_ps(a, 
idx2, b); - __m512 d = _mm512_permutex2var_ps(a, idx1, b); - _mm512_store_ps(dpl, c); - _mm512_store_ps(dph, d); - } - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, f); - _mm256_store_ps(dph, e); - } + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); } } ////////////////////////////////////////////////////////////////////////// // We split multiples of 32 followed by multiples of 16, because // we assume byte_alignment == 64 - static void avx512_interleave(float* dp, float* spl, float* sph, - int width, bool even) + static + void avx512_interleave32(float* dp, float* spl, float* sph, int width) { __m512i idx1 = _mm512_set_epi32( 0x17, 0x7, 0x16, 0x6, 0x15, 0x5, 0x14, 0x4, @@ -127,51 +101,93 @@ namespace ojph { 0x1F, 0xF, 0x1E, 0xE, 0x1D, 0xD, 0x1C, 0xC, 0x1B, 0xB, 0x1A, 0xA, 0x19, 0x9, 0x18, 0x8 ); - if (even) + for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) { - for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) - { - __m512 a = _mm512_load_ps(spl); - __m512 b = _mm512_load_ps(sph); - __m512 c = _mm512_permutex2var_ps(a, idx1, b); - __m512 d = _mm512_permutex2var_ps(a, idx2, b); - _mm512_store_ps(dp, c); - _mm512_store_ps(dp + 16, d); - } - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(a, b); - __m256 d = _mm256_unpackhi_ps(a, b); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m512 a = _mm512_load_ps(spl); + __m512 b = _mm512_load_ps(sph); + __m512 c = _mm512_permutex2var_ps(a, idx1, b); + __m512 d = _mm512_permutex2var_ps(a, idx2, b); + _mm512_store_ps(dp, c); + _mm512_store_ps(dp + 16, d); } - else + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) { - for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) - { - __m512 a = _mm512_load_ps(spl); - __m512 b = _mm512_load_ps(sph); - __m512 c = _mm512_permutex2var_ps(b, idx1, a); - __m512 d = _mm512_permutex2var_ps(b, idx2, a); - _mm512_store_ps(dp, c); - _mm512_store_ps(dp + 16, d); - } - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(b, a); - __m256 d = _mm256_unpackhi_ps(b, a); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m256 a = _mm256_load_ps(spl); + __m256 b = _mm256_load_ps(sph); + __m256 c = _mm256_unpacklo_ps(a, b); + __m256 d = _mm256_unpackhi_ps(a, b); + __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); + __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); + _mm256_store_ps(dp, e); + _mm256_store_ps(dp + 8, 
f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // We split multiples of 16 followed by multiples of 8, because
+    // we assume byte_alignment == 64
+    static void avx512_deinterleave64(double* dpl, double* dph, double* sp,
+                                      int width)
+    {
+      __m512i idx1 = _mm512_set_epi64(
+        0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
+      );
+      __m512i idx2 = _mm512_set_epi64(
+        0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
+      );
+      for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)
+      {
+        __m512d a = _mm512_load_pd(sp);
+        __m512d b = _mm512_load_pd(sp + 8);
+        __m512d c = _mm512_permutex2var_pd(a, idx1, b);
+        __m512d d = _mm512_permutex2var_pd(a, idx2, b);
+        _mm512_store_pd(dpl, c);
+        _mm512_store_pd(dph, d);
+      }
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m256d a = _mm256_load_pd(sp);
+        __m256d b = _mm256_load_pd(sp + 4);
+        __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
+        __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
+        __m256d e = _mm256_shuffle_pd(c, d, 0x0);
+        __m256d f = _mm256_shuffle_pd(c, d, 0xF);
+        _mm256_store_pd(dpl, e);
+        _mm256_store_pd(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // We split multiples of 16 followed by multiples of 8, because
+    // we assume byte_alignment == 64
+    static void avx512_interleave64(double* dp, double* spl, double* sph,
+                                    int width)
+    {
+      __m512i idx1 = _mm512_set_epi64(
+        0xB, 0x3, 0xA, 0x2, 0x9, 0x1, 0x8, 0x0
+      );
+      __m512i idx2 = _mm512_set_epi64(
+        0xF, 0x7, 0xE, 0x6, 0xD, 0x5, 0xC, 0x4
+      );
+      for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)
+      {
+        __m512d a = _mm512_load_pd(spl);
+        __m512d b = _mm512_load_pd(sph);
+        __m512d c = _mm512_permutex2var_pd(a, idx1, b);
+        __m512d d = _mm512_permutex2var_pd(a, idx2, b);
+        _mm512_store_pd(dp, c);
+        _mm512_store_pd(dp + 8, d);
+      }
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m256d a = _mm256_load_pd(spl);
+        __m256d b = _mm256_load_pd(sph);
+        __m256d c = _mm256_unpacklo_pd(a, b);
+        __m256d d = _mm256_unpackhi_pd(a, b);
+        __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
+        __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
+        _mm256_store_pd(dp, e);
+        _mm256_store_pd(dp + 4, f);
+      }
+    }
 
@@ -224,7 +240,13 @@
       if (width > 1)
       {
         // split src into ldst and hdst
-        avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even);
+        {
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          avx512_deinterleave32(dpl, dph, sp, w);
+        }
 
         // the actual horizontal transform
         float* hp = hdst->f32, * lp = ldst->f32;
@@ -352,7 +374,13 @@
         }
 
         // combine both lsrc and hsrc into dst
-        avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even);
+        {
+          float* dp = dst->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
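+          // for a line starting at an odd position, the first output
+          // sample belongs to the high-pass band, hence the swapped
+          // low/high pointers above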
+          int w = (int)width;
+          avx512_interleave32(dp, spl, sph, w);
+        }
       }
       else {
         if (even)
@@ -364,13 +392,13 @@
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                              const line_buf* other, const line_buf* aug,
-                              ui32 repeat, bool synthesis)
+    void avx512_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                                const line_buf* other, const line_buf* aug,
+                                ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       __m512i va = _mm512_set1_epi32(a);
       __m512i vb = _mm512_set1_epi32(b);
@@ -493,14 +521,185 @@
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                             const line_buf* hdst, const line_buf* src,
-                             ui32 width, bool even)
+    void avx512_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                                const line_buf* other, const line_buf* aug,
+                                ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m512i vb = _mm512_set1_epi64(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1; although the two forms are
+      // mathematically equivalent, here we identify the simpler special
+      // cases from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_add_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_add_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i w = _mm512_srai_epi64(t, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i w = _mm512_srai_epi64(t, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d 
= _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + else + for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)src1); + __m512i s2 = _mm512_load_si512((__m512i*)src2); + __m512i d = _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + } + else { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + + // This can only be used if you have AVX512DQ + // { // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)repeat; + // if (synthesis) + // for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)src1); + // __m512i s2 = _mm512_load_si512((__m512i*)src2); + // __m512i d = _mm512_load_si512((__m512i*)dst); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_sub_epi64(d, w); + // _mm512_store_si512((__m512i*)dst, d); + // } + // else + // for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)src1); + // __m512i s2 = _mm512_load_si512((__m512i*)src2); + // __m512i d = _mm512_load_si512((__m512i*)dst); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dst, d); + // } + // } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + avx512_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + avx512_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - // combine both lsrc and hsrc into dst - avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + // split src into ldst and hdst + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? 
hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + avx512_deinterleave32(dpl, dph, sp, w); + } si32* hp = hdst->i32, * lp = ldst->i32; ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass @@ -512,7 +711,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -653,10 +852,211 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // split src into ldst and hdst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)(src->p); + int w = (int)width; + avx512_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + 
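+            // the +1/-1 neighbor offset below encodes the sample phase: an
+            // even start pairs sp[0] with sp[1], an odd start pairs sp[-1]
+            // with sp[0], matching the scalar gen_rev_horz_ana64 code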
int i = (int)h_width; + if (even) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // This can only be used if you have AVX512DQ + // { + // // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)h_width; + // if (even) + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // else + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + avx512_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + avx512_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -670,7 +1070,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const 
si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -804,7 +1204,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + avx512_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -814,5 +1220,206 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + 
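+                  // d now holds dp[i] - ((b - (sp[-1] + sp[0])) >> e),
+                  // the scalar a == -1 synthesis update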
_mm512_store_si512((__m512i*)dp, d);
+              }
+          else
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m512i s1 = _mm512_load_si512((__m512i*)sp);
+              __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+              __m512i d = _mm512_load_si512((__m512i*)dp);
+              __m512i t = _mm512_add_epi64(s1, s2);
+              __m512i v = _mm512_sub_epi64(vb, t);
+              __m512i w = _mm512_srai_epi64(v, e);
+              d = _mm512_sub_epi64(d, w);
+              _mm512_store_si512((__m512i*)dp, d);
+            }
+        }
+        else
+        {
+          // general case
+          // 64bit multiplication is not supported in AVX512F + AVX512CD;
+          // in particular, _mm512_mullo_epi64 requires AVX512DQ.
+          if (ev)
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+          else
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + a * (sp[0] + sp[1])) >> e;
+        }
+
+        // This can only be used if you have AVX512DQ
+        // {
+        //   // general case
+        //   __m512i va = _mm512_set1_epi64(a);
+        //   int i = (int)aug_width;
+        //   if (ev)
+        //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+        //     {
+        //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+        //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+        //       __m512i d = _mm512_load_si512((__m512i*)dp);
+        //       __m512i t = _mm512_add_epi64(s1, s2);
+        //       __m512i u = _mm512_mullo_epi64(va, t);
+        //       __m512i v = _mm512_add_epi64(vb, u);
+        //       __m512i w = _mm512_srai_epi64(v, e);
+        //       d = _mm512_sub_epi64(d, w);
+        //       _mm512_store_si512((__m512i*)dp, d);
+        //     }
+        //   else
+        //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+        //     {
+        //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+        //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+        //       __m512i d = _mm512_load_si512((__m512i*)dp);
+        //       __m512i t = _mm512_add_epi64(s1, s2);
+        //       __m512i u = _mm512_mullo_epi64(va, t);
+        //       __m512i v = _mm512_add_epi64(vb, u);
+        //       __m512i w = _mm512_srai_epi64(v, e);
+        //       d = _mm512_sub_epi64(d, w);
+        //       _mm512_store_si512((__m512i*)dp, d);
+        //     }
+        // }
+
+        // swap buffers
+        si64* t = aug; aug = oth; oth = t;
+        ev = !ev;
+        ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+      }
+
+      // combine both lsrc and hsrc into dst
+      {
+        double* dp = (double*)(dst->p);
+        double* spl = (double*)(even ? lsrc->p : hsrc->p);
+        double* sph = (double*)(even ? 
hsrc->p : lsrc->p); + int w = (int)width; + avx512_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + avx512_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + avx512_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h index ec2a2e1..acf9ee6 100644 --- a/src/core/transform/ojph_transform_local.h +++ b/src/core/transform/ojph_transform_local.h @@ -42,7 +42,10 @@ #include "ojph_defs.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; + namespace local { struct param_atk; union lifting_step; @@ -104,60 +107,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - // Supporting macros - ////////////////////////////////////////////////////////////////////////// - - ////////////////////////////////////////////////////////////////////////// - #define SSE_DEINTERLEAVE(dpl, dph, sp, width, even) \ - { \ - if (even) \ - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ - { \ - __m128 a = _mm_load_ps(sp); \ - __m128 b = _mm_load_ps(sp + 4); \ - __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm_store_ps(dpl, c); \ - _mm_store_ps(dph, d); \ - } \ - else \ - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ - { \ - __m128 a = _mm_load_ps(sp); \ - __m128 b = _mm_load_ps(sp + 4); \ - __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm_store_ps(dpl, d); \ - _mm_store_ps(dph, c); \ - } \ - } - - ////////////////////////////////////////////////////////////////////////// - #define SSE_INTERLEAVE(dp, spl, sph, width, even) \ - { \ - if (even) \ - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ - { \ - __m128 a = _mm_load_ps(spl); \ - __m128 b = _mm_load_ps(sph); \ - __m128 c = _mm_unpacklo_ps(a, b); \ - __m128 d = _mm_unpackhi_ps(a, b); \ - _mm_store_ps(dp, c); \ - _mm_store_ps(dp + 4, d); \ - } \ - else \ - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ - { \ - __m128 a = _mm_load_ps(spl); \ - __m128 b = _mm_load_ps(sph); \ - __m128 c = _mm_unpacklo_ps(b, a); \ - __m128 d = _mm_unpackhi_ps(b, a); \ - _mm_store_ps(dp, c); \ - _mm_store_ps(dp + 4, d); \ - } \ - } - ////////////////////////////////////////////////////////////////////////// // Irreversible functions ////////////////////////////////////////////////////////////////////////// @@ -216,76 +165,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - // Supporting macros - 
//////////////////////////////////////////////////////////////////////////
-
-    //////////////////////////////////////////////////////////////////////////
-    #define AVX_DEINTERLEAVE(dpl, dph, sp, width, even) \
-    { \
-      if (even) \
-      { \
-        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(sp); \
-          __m256 b = _mm256_load_ps(sp + 8); \
-          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); \
-          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); \
-          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
-          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
-          _mm256_store_ps(dpl, e); \
-          _mm256_store_ps(dph, f); \
-        } \
-      } \
-      else \
-      { \
-        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(sp); \
-          __m256 b = _mm256_load_ps(sp + 8); \
-          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); \
-          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); \
-          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
-          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
-          _mm256_store_ps(dpl, f); \
-          _mm256_store_ps(dph, e); \
-        } \
-      } \
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    #define AVX_INTERLEAVE(dp, spl, sph, width, even) \
-    { \
-      if (even) \
-      { \
-        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(spl); \
-          __m256 b = _mm256_load_ps(sph); \
-          __m256 c = _mm256_unpacklo_ps(a, b); \
-          __m256 d = _mm256_unpackhi_ps(a, b); \
-          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); \
-          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); \
-          _mm256_store_ps(dp, e); \
-          _mm256_store_ps(dp + 8, f); \
-        } \
-      } \
-      else \
-      { \
-        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(spl); \
-          __m256 b = _mm256_load_ps(sph); \
-          __m256 c = _mm256_unpacklo_ps(b, a); \
-          __m256 d = _mm256_unpackhi_ps(b, a); \
-          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); \
-          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); \
-          _mm256_store_ps(dp, e); \
-          _mm256_store_ps(dp + 8, f); \
-        } \
-      } \
-    }
-
     //////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index 897a193..dcb5e53 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -50,6 +50,36 @@ namespace ojph {
 
   namespace local {
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m128 a = _mm_load_ps(sp);
+        __m128 b = _mm_load_ps(sp + 4);
+        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm_store_ps(dpl, c);
+        _mm_store_ps(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m128 a = _mm_load_ps(spl);
+        __m128 b = _mm_load_ps(sph);
+        __m128 c = _mm_unpacklo_ps(a, b);
+        __m128 d = _mm_unpackhi_ps(a, b);
+        _mm_store_ps(dp, c);
+        _mm_store_ps(dp + 4, 
d);
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     static inline void sse_multiply_const(float* p, float f, int width)
     {
@@ -100,11 +130,11 @@ namespace ojph {
     {
       // split src into ldst and hdst
       {
-        float* dpl = ldst->f32;
-        float* dph = hdst->f32;
+        float* dpl = even ? ldst->f32 : hdst->f32;
+        float* dph = even ? hdst->f32 : ldst->f32;
         float* sp = src->f32;
         int w = (int)width;
-        SSE_DEINTERLEAVE(dpl, dph, sp, w, even);
+        sse_deinterleave32(dpl, dph, sp, w);
       }
 
       // the actual horizontal transform
       float* hp = hdst->f32, * lp = ldst->f32;
@@ -235,10 +265,10 @@ namespace ojph {
       // combine both lsrc and hsrc into dst
       {
         float* dp = dst->f32;
-        float* spl = lsrc->f32;
-        float* sph = hsrc->f32;
+        float* spl = even ? lsrc->f32 : hsrc->f32;
+        float* sph = even ? hsrc->f32 : lsrc->f32;
         int w = (int)width;
-        SSE_INTERLEAVE(dp, spl, sph, w, even);
+        sse_interleave32(dp, spl, sph, w);
       }
     }
     else {
diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp
index 8328842..a69b1fb 100644
--- a/src/core/transform/ojph_transform_sse2.cpp
+++ b/src/core/transform/ojph_transform_sse2.cpp
@@ -35,6 +35,7 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include 
 #include 
 
 #include "ojph_defs.h"
@@ -52,13 +53,86 @@ namespace ojph {
 
   namespace local {
 
     /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
+    {
+      // note that m must be obtained using
+      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
+      __m128i x = _mm_srli_epi64(a, amt);
+      x = _mm_xor_si128(x, m);
+      __m128i result = _mm_sub_epi64(x, m);
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m128 a = _mm_load_ps(sp);
+        __m128 b = _mm_load_ps(sp + 4);
+        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm_store_ps(dpl, c);
+        _mm_store_ps(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m128 a = _mm_load_ps(spl);
+        __m128 b = _mm_load_ps(sph);
+        __m128 c = _mm_unpacklo_ps(a, b);
+        __m128 d = _mm_unpackhi_ps(a, b);
+        _mm_store_ps(dp, c);
+        _mm_store_ps(dp + 4, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    {
+      for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
+      {
+        __m128d a = _mm_load_pd(sp);
+        __m128d b = _mm_load_pd(sp + 2);
+        __m128d c = _mm_shuffle_pd(a, b, 0);
+        __m128d d = _mm_shuffle_pd(a, b, 3);
+        _mm_store_pd(dpl, c);
+        _mm_store_pd(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
+      {
+        __m128d a = _mm_load_pd(spl);
+        __m128d b = 
_mm_load_pd(sph);
+        __m128d c = _mm_unpacklo_pd(a, b);
+        __m128d d = _mm_unpackhi_pd(a, b);
+        _mm_store_pd(dp, c);
+        _mm_store_pd(dp + 2, d);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const si32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
       __m128i vb = _mm_set1_epi32(b);
 
       si32* dst = aug->i32;
@@ -162,19 +236,153 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    static
+    void sse2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si64 a = s->rev.Aatk;
+      const si64 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m128i vb = _mm_set1_epi64x(b);
+      __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler form from Part 1 and employ it
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_add_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_add_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i w = sse2_mm_srai_epi64(t, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i w = sse2_mm_srai_epi64(t, e, ve);
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_sub_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = 
_mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_sub_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else { // general case
+        // 64bit multiplication is not supported in sse2
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug,
+                            ui32 repeat, bool synthesis)
+    {
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
     {
       if (width > 1)
       {
-        // combine both lsrc and hsrc into dst
+        // split src into ldst and hdst
         {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
           float* sp = src->f32;
           int w = (int)width;
-          SSE_DEINTERLEAVE(dpl, dph, sp, w, even);
+          sse2_deinterleave32(dpl, dph, sp, w);
         }
 
         si32* hp = hdst->i32, * lp = ldst->i32;
@@ -187,7 +395,7 @@ namespace ojph {
          const lifting_step* s = atk->get_step(j - 1);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
-         const si32 e = s->rev.Eatk;
+         const ui8 e = s->rev.Eatk;
          __m128i vb = _mm_set1_epi32(b);
 
          // extension
@@ -284,9 +492,7 @@ namespace ojph {
          }
          else
          { // general case
-           // 32bit multiplication is not supported in sse2; we need sse4.1,
-           // where we can use _mm_mullo_epi32, which multiplies
-           // 32bit x 32bit, keeping the LSBs
+           // 32bit multiplication is not supported in sse2; we need sse4.1
            if (even)
              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
                *dp += (b + a * (sp[0] + sp[1])) >> e;
@@ -308,11 +514,179 @@ namespace ojph {
          hdst->i32[0] = src->i32[0] << 1;
       }
     }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp = (double*)src->p;
+          int w = (int)width;
+          sse2_deinterleave64(dpl, dph, sp, w);
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 
1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in sse2 + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + 
} + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } ////////////////////////////////////////////////////////////////////////// - void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, - ui32 width, bool even) + void sse2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -326,7 +700,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m128i vb = _mm_set1_epi32(b); // extension @@ -443,10 +817,10 @@ namespace ojph { // combine both lsrc and hsrc into dst { float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; int w = (int)width; - SSE_INTERLEAVE(dp, spl, sph, w, even); + sse2_interleave32(dp, spl, sph, w); } } else { @@ -457,5 +831,172 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in sse2 + if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? 
hsrc->p : lsrc->p); + int w = (int)width; + sse2_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp index bd652df..341cfc3 100644 --- a/src/core/transform/ojph_transform_wasm.cpp +++ b/src/core/transform/ojph_transform_wasm.cpp @@ -51,65 +51,69 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void wasm_deinterleave(float* dpl, float* dph, float* sp, - int width, bool even) + static inline + void wasm_deinterleave32(float* dpl, float* dph, float* sp, int width) { - if (even) - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) - { - v128_t a = wasm_v128_load(sp); - v128_t b = wasm_v128_load(sp + 4); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); - v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); - // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); - // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); - wasm_v128_store(dpl, c); - wasm_v128_store(dph, d); - } - else - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) - { - v128_t a = wasm_v128_load(sp); - v128_t b = wasm_v128_load(sp + 4); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); - v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); - // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); - // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); - wasm_v128_store(dpl, d); - wasm_v128_store(dph, c); - } + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 4); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); + v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); + // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); + // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } } ////////////////////////////////////////////////////////////////////////// - void wasm_interleave(float* dp, float* spl, float* sph, - int width, bool even) + static inline + void wasm_interleave32(float* dp, float* spl, float* sph, int width) { - if (even) - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) - { - v128_t a = wasm_v128_load(spl); - v128_t b = wasm_v128_load(sph); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); - v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); - // v128_t c = _mm_unpacklo_ps(a, b); - // v128_t d = _mm_unpackhi_ps(a, b); - wasm_v128_store(dp, c); - wasm_v128_store(dp + 4, d); - } - else - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) - { - v128_t a = wasm_v128_load(spl); - v128_t b = 
wasm_v128_load(sph); - v128_t c = wasm_i32x4_shuffle(b, a, 0, 4 + 0, 1, 4 + 1); - v128_t d = wasm_i32x4_shuffle(b, a, 2, 4 + 2, 3, 4 + 3); - // v128_t c = _mm_unpacklo_ps(b, a); - // v128_t d = _mm_unpackhi_ps(b, a); - wasm_v128_store(dp, c); - wasm_v128_store(dp + 4, d); - } + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); + v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); + // v128_t c = _mm_unpacklo_ps(a, b); + // v128_t d = _mm_unpackhi_ps(a, b); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 4, d); + } } + ////////////////////////////////////////////////////////////////////////// + static inline + void wasm_deinterleave64(double* dpl, double* dph, double* sp, int width) + { + for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 2); + v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void wasm_interleave64(double* dp, double* spl, double* sph, int width) + { + for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 2, d); + } + } + ////////////////////////////////////////////////////////////////////////// static inline void wasm_multiply_const(float* p, float f, int width) { @@ -159,7 +163,13 @@ namespace ojph { if (width > 1) { // split src into ldst and hdst - wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + wasm_deinterleave32(dpl, dph, sp, w); + } // the actual horizontal transform float* hp = hdst->f32, * lp = ldst->f32; @@ -287,7 +297,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? 
hsrc->f32 : lsrc->f32;
+        int w = (int)width;
+        wasm_interleave32(dp, spl, sph, w);
+      }
     }
     else {
       if (even)
@@ -298,13 +314,13 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    void wasm_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
       v128_t va = wasm_i32x4_splat(a);
       v128_t vb = wasm_i32x4_splat(b);
@@ -428,14 +444,174 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    void wasm_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      v128_t va = wasm_i64x2_splat(a);
+      v128_t vb = wasm_i64x2_splat(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler form from Part 1 and employ it
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_add(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_add(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t w = wasm_i64x2_shr(t, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t w = wasm_i64x2_shr(t, e);
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_sub(vb, t);
+            
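+            // note: wasm_i64x2_shr is the arithmetic (sign-preserving)
+            // 64-bit shift, so no emulation is needed here, unlike sse2,
+            // which requires the sse2_mm_srai_epi64 helper above
+            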
v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + else + { // general case + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + wasm_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + wasm_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void wasm_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { // combine both lsrc and hsrc into dst - wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + wasm_deinterleave32(dpl, dph, sp, w); + } si32* hp = hdst->i32, * lp = ldst->i32; ui32 l_width = (width + (even ? 
1 : 0)) >> 1; // low pass
@@ -447,7 +623,7 @@ namespace ojph {
          const lifting_step* s = atk->get_step(j - 1);
          const si32 a = s->rev.Aatk;
          const si32 b = s->rev.Batk;
-         const ui32 e = s->rev.Eatk;
+         const ui8 e = s->rev.Eatk;
          v128_t va = wasm_i32x4_splat(a);
          v128_t vb = wasm_i32x4_splat(b);
@@ -587,11 +763,199 @@ namespace ojph {
          hdst->i32[0] = src->i32[0] << 1;
       }
     }
-
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                           const line_buf* lsrc, const line_buf* hsrc,
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void wasm_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp = (double*)src->p;
+          int w = (int)width;
+          wasm_deinterleave64(dpl, dph, sp, w);
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = wasm_i64x2_splat(a);
+          v128_t vb = wasm_i64x2_splat(b);
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si64* sp = lp;
+          si64* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          { // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = 
wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + wasm_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + wasm_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -605,7 +969,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; v128_t va = wasm_i32x4_splat(a); v128_t vb = wasm_i32x4_splat(b); @@ -739,7 +1103,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + wasm_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -749,5 +1119,192 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 
1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = wasm_i64x2_splat(a); + v128_t vb = wasm_i64x2_splat(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = 
wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t u = wasm_i64x2_mul(va, t);
+                v128_t v = wasm_i64x2_add(vb, u);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si64* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          double* dp = (double*)dst->p;
+          double* spl = (double*)(even ? lsrc->p : hsrc->p);
+          double* sph = (double*)(even ? hsrc->p : lsrc->p);
+          int w = (int)width;
+          wasm_interleave64(dp, spl, sph, w);
+        }
+      }
+      else {
+        if (even)
+          dst->i64[0] = lsrc->i64[0];
+        else
+          dst->i64[0] = hsrc->i64[0] >> 1;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc,
+                           ui32 width, bool even)
+    {
+      if (dst->flags & line_buf::LFT_32BIT)
+      {
+        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
+        wasm_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
+      }
+      else
+      {
+        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
+               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
+        wasm_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
+      }
+    }
+
   } // !local
 } // !ojph
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 000409f..8cc1d72 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -3,7 +3,7 @@ include(FetchContent)
 
 FetchContent_Declare(
   googletest
-  URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz
+  URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz
   EXCLUDE_FROM_ALL
 )
 # For Windows: Prevent overriding the parent project's compiler/linker settings
diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp
index 9f77f75..22f148e 100644
--- a/tests/test_executables.cpp
+++ b/tests/test_executables.cpp
@@ -107,8 +107,27 @@ int execute(const std::string& cmd, std::string& result)
 #define REF_FILE_DIR "./jp2k_test_codestreams/openjph/references/"
 #define MSE_PAE_PATH "./mse_pae"
 #define COMPARE_FILES_PATH "./compare_files"
+
+// A note to self, to help with emscripten testing; this was written after
+// the tests were completed.
+// 1. Compile for the target platform (Linux), selecting from the following
+//    code the version that suits you; in particular, it should be the one
+//    that uses node. Ideally, create two versions of test_executables, one
+//    for WASM with SIMD and one for WASM without SIMD -- use the Linux cp
+//    command to create test_executables_simd and test_executables_no_simd.
+// 2. Compile again, without deleting what was compiled; this time compile
+//    using emscripten, targeting WASM. The compilation is very finicky, so
+//    do 'make clean && make' after every change in the code.
+// 3. cd to tests, and run test_executables_simd or test_executables_no_simd.
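+//
+// As an illustrative sketch only (the exact commands depend on your local
+// emscripten setup and are not prescribed by this repository):
+//   emcmake cmake -DCMAKE_BUILD_TYPE=Release ..
+//   emmake make
+//   node ./ojph_compress.js ...
+// passing -msimd128 through CMAKE_CXX_FLAGS produces the WASM SIMD variant.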
+ #define EXPAND_EXECUTABLE "./ojph_expand" #define COMPRESS_EXECUTABLE "./ojph_compress" +//#define EXPAND_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_expand.js" +//#define COMPRESS_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_compress.js" +//#define EXPAND_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_expand_simd.js" +//#define COMPRESS_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_compress_simd.js" +//#define EXPAND_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_expand" +//#define COMPRESS_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_compress" #endif #define TOL_DOUBLE 0.01 #define TOL_INTEGER 1
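Editor's note: every vectorized branch in the reversible paths above computes
the same scalar recurrence, dst +/- ((b + a * (s1 + s2)) >> e), and the SSE2
path additionally emulates the 64-bit arithmetic right shift that the
instruction set lacks. A minimal, self-contained C++ sketch of both ideas
(function names here are illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// Scalar reference for one reversible lifting step: analysis adds the
// correction term, synthesis subtracts it (see the scalar fallbacks above).
static void rev_lift_step(int64_t* dst, const int64_t* s1, const int64_t* s2,
                          uint32_t repeat, int64_t a, int64_t b, uint8_t e,
                          bool synthesis)
{
  for (uint32_t i = 0; i < repeat; ++i)
    if (synthesis)
      dst[i] -= (b + a * (s1[i] + s2[i])) >> e;
    else
      dst[i] += (b + a * (s1[i] + s2[i])) >> e;
}

// The trick behind sse2_mm_srai_epi64: SSE2 has no 64-bit arithmetic right
// shift, so shift logically and restore the sign bits with
// m = 1ULL << (63 - amt); ((x >> amt) ^ m) - m sign-extends the result.
static int64_t srai64_emulated(int64_t x, int amt)
{
  uint64_t m = 1ULL << (63 - amt);
  uint64_t v = (uint64_t)x >> amt;  // logical shift
  return (int64_t)((v ^ m) - m);    // xor/sub propagates the sign bit
}

int main()
{
  // a 5/3-style update step (a == 1, b == 2, e == 2) on a tiny example
  int64_t low[2]  = { 10, -7 };
  int64_t odd1[2] = {  3, -5 };
  int64_t odd2[2] = {  1, -9 };
  rev_lift_step(low, odd1, odd2, 2, 1, 2, 2, false);
  assert(low[0] == 11 && low[1] == -10);

  // the emulated arithmetic shift matches the built-in one
  const int64_t vals[2] = { -1000, 1000 };
  for (int64_t x : vals)
    assert(srai64_emulated(x, 5) == (x >> 5));
  return 0;
}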