From 61fedfbe98f474dbdbf78d8b48487d4e6f4b3d49 Mon Sep 17 00:00:00 2001 From: Tomasz Szumski Date: Thu, 6 Jun 2024 09:41:11 +0200 Subject: [PATCH 01/78] Fix bitstream mismatch between ojph_encode_codeblock() and ojph_encode_codeblock_avx512() --- src/core/coding/ojph_block_encoder_avx512.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/core/coding/ojph_block_encoder_avx512.cpp b/src/core/coding/ojph_block_encoder_avx512.cpp index 5912b09f..9df0e8ef 100644 --- a/src/core/coding/ojph_block_encoder_avx512.cpp +++ b/src/core/coding/ojph_block_encoder_avx512.cpp @@ -377,6 +377,13 @@ namespace ojph { if (melp->run > 0) mel_emit_bit(melp, 1); + if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) { + *(vlcp->buf - vlcp->pos) = 0x7f; + vlcp->pos++; + vlcp->tmp >>= 7; + vlcp->used_bits -= 7; + } + melp->tmp = melp->tmp << melp->remaining_bits; int mel_mask = (0xFF << melp->remaining_bits) & 0xFF; int vlc_mask = 0xFF >> (8 - vlcp->used_bits); From 1021cc833e2f9fb2da62c6053a8c11be35a19fa0 Mon Sep 17 00:00:00 2001 From: Tomasz Szumski Date: Mon, 3 Jun 2024 13:10:16 +0200 Subject: [PATCH 02/78] ojph_encode_codeblock_avx2() implementation --- src/core/CMakeLists.txt | 8 +- src/core/codestream/ojph_codeblock_fun.cpp | 1 + src/core/coding/ojph_block_encoder.h | 7 + src/core/coding/ojph_block_encoder_avx2.cpp | 1213 +++++++++++++++++++ 4 files changed, 1227 insertions(+), 2 deletions(-) create mode 100644 src/core/coding/ojph_block_encoder_avx2.cpp diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 555de0ec..94bcfcc3 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -10,6 +10,7 @@ file(GLOB CODESTREAM_WASM "codestream/*_wasm.cpp") file(GLOB CODING "coding/*.cpp" "coding/*.h") file(GLOB CODING_SSSE3 "coding/*_ssse3.cpp") file(GLOB CODING_WASM "coding/*_wasm.cpp") +file(GLOB CODING_AVX2 "coding/*_avx2.cpp") file(GLOB CODING_AVX512 "coding/*_avx512.cpp") file(GLOB COMMON "common/*.h") file(GLOB OTHERS "others/*.cpp") @@ 
-22,7 +23,7 @@ file(GLOB TRANSFORM_AVX512 "transform/*_avx512.cpp") file(GLOB TRANSFORM_WASM "transform/*_wasm.cpp") list(REMOVE_ITEM CODESTREAM ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODESTREAM_WASM}) -list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX512}) +list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX2} ${CODING_AVX512}) list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512} ${TRANSFORM_WASM}) list(APPEND SOURCES ${CODESTREAM} ${CODING} ${COMMON} ${OTHERS} ${TRANSFORM}) @@ -70,9 +71,10 @@ else() source_group("transform" FILES ${TRANSFORM_AVX}) endif() if (NOT OJPH_DISABLE_AVX2) - list(APPEND SOURCES ${CODESTREAM_AVX2} ${TRANSFORM_AVX2}) + list(APPEND SOURCES ${CODESTREAM_AVX2} ${TRANSFORM_AVX2} ${CODING_AVX2}) source_group("codestream" FILES ${CODESTREAM_AVX2}) source_group("transform" FILES ${TRANSFORM_AVX2}) + source_group("coding" FILES ${CODING_AVX2}) endif() if ((NOT OJPH_DISABLE_AVX512) AND ("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_X86_64")) list(APPEND SOURCES ${CODING_AVX512} ${TRANSFORM_AVX512}) @@ -84,6 +86,7 @@ else() if (MSVC) set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX") set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") + set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512") set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX") set_source_files_properties(transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") @@ -94,6 +97,7 @@ else() set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS -mavx) set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES 
COMPILE_FLAGS -mavx2) set_source_files_properties(coding/ojph_block_decoder_ssse3.cpp PROPERTIES COMPILE_FLAGS -mssse3) + set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx512cd) set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS -mavx) set_source_files_properties(transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index cf51530b..732d5263 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -169,6 +169,7 @@ namespace ojph { tx_to_cb = avx2_irv_tx_to_cb; tx_from_cb = avx2_irv_tx_from_cb; } + encode_cb = ojph_encode_codeblock_avx2; } #endif // !OJPH_DISABLE_AVX2 diff --git a/src/core/coding/ojph_block_encoder.h b/src/core/coding/ojph_block_encoder.h index 0c4b9267..43d32d8b 100644 --- a/src/core/coding/ojph_block_encoder.h +++ b/src/core/coding/ojph_block_encoder.h @@ -58,6 +58,13 @@ namespace ojph { ojph::mem_elastic_allocator *elastic, ojph::coded_lists *& coded); + void + ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs, + ui32 num_passes, ui32 width, ui32 height, + ui32 stride, ui32* lengths, + ojph::mem_elastic_allocator* elastic, + ojph::coded_lists*& coded); + void ojph_encode_codeblock_avx512(ui32* buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, diff --git a/src/core/coding/ojph_block_encoder_avx2.cpp b/src/core/coding/ojph_block_encoder_avx2.cpp new file mode 100644 index 00000000..d579f83a --- /dev/null +++ b/src/core/coding/ojph_block_encoder_avx2.cpp @@ -0,0 +1,1213 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. 
+// +// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2019, The University of New South Wales, Australia +// Copyright (c) 2024, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. 
+// File: ojph_block_encoder_avx2.cpp +//***************************************************************************/ + +#include +#include +#include +#include +#include + +#include "ojph_mem.h" +#include "ojph_arch.h" +#include "ojph_block_encoder.h" +#include "ojph_message.h" + +#ifdef OJPH_COMPILER_MSVC + #define likely(x) (x) + #define unlikely(x) (x) +#else + #define likely(x) __builtin_expect((x), 1) + #define unlikely(x) __builtin_expect((x), 0) +#endif + +namespace ojph { + namespace local { + + ///////////////////////////////////////////////////////////////////////// + // tables + ///////////////////////////////////////////////////////////////////////// + + //VLC encoding + // index is (c_q << 8) + (rho << 4) + eps + // data is (cwd << 8) + (cwd_len << 4) + eps + // table 0 is for the initial line of quads + static ui32 vlc_tbl0[2048] = { 0 }; + static ui32 vlc_tbl1[2048] = { 0 }; + + //UVLC encoding + static ui32 ulvc_cwd_pre[33]; + static int ulvc_cwd_pre_len[33]; + static ui32 ulvc_cwd_suf[33]; + static int ulvc_cwd_suf_len[33]; + + ///////////////////////////////////////////////////////////////////////// + static bool vlc_init_tables() + { + struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; }; + vlc_src_table tbl0[] = { + #include "table0.h" + }; + size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table); + + si32 pattern_popcnt[16]; + for (ui32 i = 0; i < 16; ++i) + pattern_popcnt[i] = (si32)population_count(i); + + vlc_src_table* src_tbl = tbl0; + ui32 *tgt_tbl = vlc_tbl0; + size_t tbl_size = tbl0_size; + for (int i = 0; i < 2048; ++i) + { + int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF; + if (((emb & rho) != emb) || (rho == 0 && c_q == 0)) + tgt_tbl[i] = 0; + else + { + vlc_src_table *best_entry = NULL; + if (emb) // u_off = 1 + { + int best_e_k = -1; + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 1) + if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1) + 
{ + //now we need to find the smallest cwd with the highest + // number of bits set in e_k + int ones_count = pattern_popcnt[src_tbl[j].e_k]; + if (ones_count >= best_e_k) + { + best_entry = src_tbl + j; + best_e_k = ones_count; + } + } + } + } + else // u_off = 0 + { + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 0) + { + best_entry = src_tbl + j; + break; + } + } + } + assert(best_entry); + tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4) + + best_entry->e_k); + } + } + + vlc_src_table tbl1[] = { + #include "table1.h" + }; + size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table); + + src_tbl = tbl1; + tgt_tbl = vlc_tbl1; + tbl_size = tbl1_size; + for (int i = 0; i < 2048; ++i) + { + int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF; + if (((emb & rho) != emb) || (rho == 0 && c_q == 0)) + tgt_tbl[i] = 0; + else + { + vlc_src_table *best_entry = NULL; + if (emb) // u_off = 1 + { + int best_e_k = -1; + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 1) + if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1) + { + //now we need to find the smallest cwd with the highest + // number of bits set in e_k + int ones_count = pattern_popcnt[src_tbl[j].e_k]; + if (ones_count >= best_e_k) + { + best_entry = src_tbl + j; + best_e_k = ones_count; + } + } + } + } + else // u_off = 0 + { + for (size_t j = 0; j < tbl_size; ++j) + { + if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho) + if (src_tbl[j].u_off == 0) + { + best_entry = src_tbl + j; + break; + } + } + } + assert(best_entry); + tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4) + + best_entry->e_k); + } + } + + + return true; + } + + ///////////////////////////////////////////////////////////////////////// + static bool uvlc_init_tables() + { + //code goes from 0 to 31, extension and 32 are not supported here + ulvc_cwd_pre[0] = 0; 
ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2; + ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4; + ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1; + ulvc_cwd_pre_len[2] = 2; + ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3; + ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0; + ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1; + ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0; + ulvc_cwd_suf_len[2] = 0; + ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1; + for (int i = 5; i < 33; ++i) + { + ulvc_cwd_pre[i] = 0; + ulvc_cwd_pre_len[i] = 3; + ulvc_cwd_suf[i] = (ui32)(i-5); + ulvc_cwd_suf_len[i] = 5; + } + return true; + } + + ///////////////////////////////////////////////////////////////////////// + bool initialize_tables_avx2() { + if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { + bool result; + result = vlc_init_tables(); + result = result && uvlc_init_tables(); + return result; + } + return false; + } + + ///////////////////////////////////////////////////////////////////////// + static bool tables_initialized = initialize_tables_avx2(); + + ///////////////////////////////////////////////////////////////////////// + // + ///////////////////////////////////////////////////////////////////////// + struct mel_struct { + //storage + ui8* buf; //pointer to data buffer + ui32 pos; //position of next writing within buf + ui32 buf_size; //size of buffer, which we must not exceed + + // all these can be replaced by bytes + int remaining_bits; //number of empty bits in tmp + int tmp; //temporary storage of coded bits + int run; //number of 0 run + int k; //state + int threshold; //threshold where one bit must be coded + }; + + ////////////////////////////////////////////////////////////////////////// + static inline void + mel_init(mel_struct* melp, ui32 buffer_size, ui8* data) + { + melp->buf = data; + melp->pos = 0; + melp->buf_size = buffer_size; + melp->remaining_bits = 8; + melp->tmp = 0; + melp->run = 0; + melp->k = 0; + melp->threshold = 1; // this is 1 << 
mel_exp[melp->k]; + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + mel_emit_bit(mel_struct* melp, int v) + { + melp->tmp = (melp->tmp << 1) + v; + melp->remaining_bits--; + if (melp->remaining_bits == 0) { + melp->buf[melp->pos++] = (ui8)melp->tmp; + melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8); + melp->tmp = 0; + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + mel_encode(mel_struct* melp, bool bit) + { + //MEL exponent + static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5}; + + if (bit == false) { + ++melp->run; + if (melp->run >= melp->threshold) { + mel_emit_bit(melp, 1); + melp->run = 0; + melp->k = ojph_min(12, melp->k + 1); + melp->threshold = 1 << mel_exp[melp->k]; + } + } else { + mel_emit_bit(melp, 0); + int t = mel_exp[melp->k]; + while (t > 0) { + mel_emit_bit(melp, (melp->run >> --t) & 1); + } + melp->run = 0; + melp->k = ojph_max(0, melp->k - 1); + melp->threshold = 1 << mel_exp[melp->k]; + } + } + + ///////////////////////////////////////////////////////////////////////// + // + ///////////////////////////////////////////////////////////////////////// + struct vlc_struct { + //storage + ui8* buf; //pointer to data buffer + ui32 pos; //position of next writing within buf + ui32 buf_size; //size of buffer, which we must not exceed + + int used_bits; //number of occupied bits in tmp + ui64 tmp; //temporary storage of coded bits + bool last_greater_than_8F; //true if last byte us greater than 0x8F + }; + + ////////////////////////////////////////////////////////////////////////// + static inline void + vlc_init(vlc_struct* vlcp, ui32 buffer_size, ui8* data) + { + vlcp->buf = data + buffer_size - 1; //points to last byte + vlcp->pos = 1; //locations will be all -pos + vlcp->buf_size = buffer_size; + + vlcp->buf[0] = 0xFF; + vlcp->used_bits = 4; + vlcp->tmp = 0xF; + vlcp->last_greater_than_8F = true; + } + + 
////////////////////////////////////////////////////////////////////////// + static inline void + vlc_encode(vlc_struct* vlcp, ui32 cwd, int cwd_len) + { + vlcp->tmp |= (ui64)cwd << vlcp->used_bits; + vlcp->used_bits += cwd_len; + + while (vlcp->used_bits >= 8) { + ui8 tmp; + + if (unlikely(vlcp->last_greater_than_8F)) { + tmp = vlcp->tmp & 0x7F; + + if (likely(tmp != 0x7F)) { + tmp = vlcp->tmp & 0xFF; + *(vlcp->buf - vlcp->pos) = tmp; + vlcp->last_greater_than_8F = tmp > 0x8F; + vlcp->tmp >>= 8; + vlcp->used_bits -= 8; + } else { + *(vlcp->buf - vlcp->pos) = tmp; + vlcp->last_greater_than_8F = false; + vlcp->tmp >>= 7; + vlcp->used_bits -= 7; + } + + } else { + tmp = vlcp->tmp & 0xFF; + *(vlcp->buf - vlcp->pos) = tmp; + vlcp->last_greater_than_8F = tmp > 0x8F; + vlcp->tmp >>= 8; + vlcp->used_bits -= 8; + } + + vlcp->pos++; + } + } + + ////////////////////////////////////////////////////////////////////////// + // + ////////////////////////////////////////////////////////////////////////// + static inline void + terminate_mel_vlc(mel_struct* melp, vlc_struct* vlcp) + { + if (melp->run > 0) + mel_emit_bit(melp, 1); + + if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) { + *(vlcp->buf - vlcp->pos) = 0x7f; + vlcp->pos++; + vlcp->tmp >>= 7; + vlcp->used_bits -= 7; + } + + melp->tmp = melp->tmp << melp->remaining_bits; + int mel_mask = (0xFF << melp->remaining_bits) & 0xFF; + int vlc_mask = 0xFF >> (8 - vlcp->used_bits); + if ((mel_mask | vlc_mask) == 0) + return; //last mel byte cannot be 0xFF, since then + //melp->remaining_bits would be < 8 + if (melp->pos >= melp->buf_size) + OJPH_ERROR(0x00020003, "mel encoder's buffer is full"); + ui8 vlcp_tmp = (ui8)vlcp->tmp; + int fuse = melp->tmp | vlcp_tmp; + if ( ( ((fuse ^ melp->tmp) & mel_mask) + | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0 + && (fuse != 0xFF) && vlcp->pos > 1) + { + melp->buf[melp->pos++] = (ui8)fuse; + } + else + { + if (vlcp->pos >= vlcp->buf_size) + OJPH_ERROR(0x00020004, "vlc encoder's buffer 
is full"); + melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF + *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp; + vlcp->pos++; + } + } + +///////////////////////////////////////////////////////////////////////// +// +///////////////////////////////////////////////////////////////////////// + struct ms_struct { + //storage + ui8* buf; //pointer to data buffer + ui32 pos; //position of next writing within buf + ui32 buf_size; //size of buffer, which we must not exceed + + int max_bits; //maximum number of bits that can be store in tmp + int used_bits; //number of occupied bits in tmp + ui32 tmp; //temporary storage of coded bits + }; + + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_init(ms_struct* msp, ui32 buffer_size, ui8* data) + { + msp->buf = data; + msp->pos = 0; + msp->buf_size = buffer_size; + msp->max_bits = 8; + msp->used_bits = 0; + msp->tmp = 0; + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_encode(ms_struct* msp, ui64 cwd, int cwd_len) + { + while (cwd_len > 0) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full"); + int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len); + msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits; + msp->used_bits += t; + cwd >>= t; + cwd_len -= t; + if (msp->used_bits >= msp->max_bits) + { + msp->buf[msp->pos++] = (ui8)msp->tmp; + msp->max_bits = (msp->tmp == 0xFF) ? 
7 : 8; + msp->tmp = 0; + msp->used_bits = 0; + } + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_terminate(ms_struct* msp) + { + if (msp->used_bits) + { + int t = msp->max_bits - msp->used_bits; //unused bits + msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits; + msp->used_bits += t; + if (msp->tmp != 0xFF) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full"); + msp->buf[msp->pos++] = (ui8)msp->tmp; + } + } + else if (msp->max_bits == 7) + msp->pos--; + } + +#define ZERO _mm256_setzero_si256() +#define ONE _mm256_set1_epi32(1) + +// https://stackoverflow.com/a/58827596 +inline __m256i avx2_lzcnt_epi32(__m256i v) { + // prevent value from being rounded up to the next power of two + v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep 8 MSB + + v = _mm256_castps_si256(_mm256_cvtepi32_ps(v)); // convert an integer to float + v = _mm256_srli_epi32(v, 23); // shift down the exponent + v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias + v = _mm256_min_epi16(v, _mm256_set1_epi32(32)); // clamp at 32 + + return v; +} + +inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) { + return _mm256_xor_si256(_mm256_cmpeq_epi32(v, v2), _mm256_set1_epi32((int32_t)0xffffffff)); +} + +static void proc_pixel(__m256i *src_vec, ui32 p, + __m256i *eq_vec, __m256i *s_vec, + __m256i &rho_vec, __m256i &e_qmax_vec) +{ + __m256i val_vec[4]; + __m256i _eq_vec[4]; + __m256i _s_vec[4]; + __m256i _rho_vec[4]; + + for (ui32 i = 0; i < 4; ++i) { + /* val = t + t; //multiply by 2 and get rid of sign */ + val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]); + + /* val >>= p; // 2 \mu_p + x */ + val_vec[i] = _mm256_srli_epi32(val_vec[i], (int)p); + + /* val &= ~1u; // 2 \mu_p */ + val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((int)~1u)); + + /* if (val) { */ + const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO); + + /* rho[i] = 1 << 
i; + * rho is processed below. + */ + + /* e_q[i] = 32 - (int)count_leading_ZEROs(--val); //2\mu_p - 1 */ + val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE); + _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]); + _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]); + + /* e_qmax[i] = ojph_max(e_qmax[i], e_q[j]); + * e_qmax is processed below + */ + + /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */ + val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE); + _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31); + _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]); + + _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask); + _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask); + val_vec[i] = _mm256_srli_epi32(val_notmask, 31); + /* } */ + } + + const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0); + + /* Reorder from + * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7] + * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5],.[1, 6], [1, 7] + * *_vec[2]:[0, 8], [0, 9], [0,10], [0,11], [0,12], [0,13], [0,14], [0,15] + * *_vec[3]:[1, 8], [1, 9], [1,10], [1,11], [1,12], [1,13], [1,14], [1,15] + * to + * *_vec[0]:[0, 0], [0, 2], [0, 4], [0, 6], [0, 8], [0,10], [0,12], [0,14] + * *_vec[1]:[1, 0], [1, 2], [1, 4], [1, 6], [1, 8], [1,10], [1,12], [1,14] + * *_vec[2]:[0, 1], [0, 3], [0, 5], [0, 7], [0, 9], [0,11], [0,13], [0,15] + * *_vec[3]:[1, 1], [1, 3], [1, 5], [1, 7], [1, 9], [1,11], [1,13], [1,15] + */ + __m256i tmp1, tmp2; + for (ui32 i = 0; i < 2; ++i) { + tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx); + tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx); + eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4)); + eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4)); + + tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx); + tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx); + s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4)); + s_vec[2 + i] = 
_mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4)); + + tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx); + tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx); + _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4)); + _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4)); + } + + e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]); + e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]); + e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]); + _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1); + _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2); + _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3); + rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]); + rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]); + rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]); +} + +/* from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, ...] + * [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, ...] + * [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, ...] + * [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ...] + * + * to [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31, + * 0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33] + * + * [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35, + * 0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37] + * + * [..] 
+ */ +static void rotate_matrix(__m256i *matrix) +{ + __m256i tmp1 = _mm256_unpacklo_epi32(matrix[0], matrix[1]); + __m256i tmp2 = _mm256_unpacklo_epi32(matrix[2], matrix[3]); + __m256i tmp3 = _mm256_unpackhi_epi32(matrix[0], matrix[1]); + __m256i tmp4 = _mm256_unpackhi_epi32(matrix[2], matrix[3]); + + matrix[0] = _mm256_unpacklo_epi64(tmp1, tmp2); + matrix[1] = _mm256_unpacklo_epi64(tmp3, tmp4); + matrix[2] = _mm256_unpackhi_epi64(tmp1, tmp2); + matrix[3] = _mm256_unpackhi_epi64(tmp3, tmp4); + + tmp1 = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x20); + matrix[2] = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x31); + matrix[0] = tmp1; + + tmp1 = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x20); + matrix[3] = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x31); + matrix[1] = tmp1; +} + +static void proc_ms_encode(ms_struct *msp, + __m256i &tuple_vec, + __m256i &uq_vec, + __m256i &rho_vec, + __m256i *s_vec) +{ + __m256i m_vec[4]; + + /* Prepare parameters for ms_encode */ + /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */ + auto tmp = _mm256_and_si256(tuple_vec, ONE); + tmp = _mm256_sub_epi32(uq_vec, tmp); + auto tmp1 = _mm256_and_si256(rho_vec, ONE); + auto mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[0] = _mm256_and_si256(mask, tmp); + + /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */ + tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2)); + tmp = _mm256_srli_epi32(tmp, 1); + tmp = _mm256_sub_epi32(uq_vec, tmp); + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2)); + mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[1] = _mm256_and_si256(mask, tmp); + + /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */ + tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4)); + tmp = _mm256_srli_epi32(tmp, 2); + tmp = _mm256_sub_epi32(uq_vec, tmp); + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4)); + mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[2] = _mm256_and_si256(mask, tmp); + + /* m = (rho[i] & 8) ? 
Uq[i] - ((tuple[i] & 8) >> 3) : 0; */ + tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8)); + tmp = _mm256_srli_epi32(tmp, 3); + tmp = _mm256_sub_epi32(uq_vec, tmp); + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8)); + mask = avx2_cmpneq_epi32(tmp1, ZERO); + m_vec[3] = _mm256_and_si256(mask, tmp); + + rotate_matrix(m_vec); + /* s_vec from + * s_vec[0]:[0, 0], [0, 2] ... [0,14], [0, 16], [0, 18] ... [0,30] + * s_vec[1]:[1, 0], [1, 2] ... [1,14], [1, 16], [1, 18] ... [1,30] + * s_vec[2]:[0, 1], [0, 3] ... [0,15], [0, 17], [0, 19] ... [0,31] + * s_vec[3]:[1, 1], [1, 3] ... [1,15], [1, 17], [1, 19] ... [1,31] + * to + * s_vec[0]:[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]...[0, 7], [1, 7] + * s_vec[1]:[0, 8], [1, 8], [0, 9], [1, 9], [0,10], [1,10]...[0,15], [1,15] + * s_vec[2]:[0,16], [1,16], [0,17], [1,17], [0,18], [1,18]...[0,23], [1,23] + * s_vec[3]:[0,24], [1,24], [0,25], [1,25], [0,26], [1,26]...[0,31], [1,31] + */ + rotate_matrix(s_vec); + + ui32 cwd[8]; + int cwd_len[8]; + ui64 _cwd = 0; + int _cwd_len = 0; + + /* Each iteration process 8 bytes * 2 lines */ + for (ui32 i = 0; i < 4; ++i) { + /* cwd = s[i * 4 + 0] & ((1U << m) - 1) + * cwd_len = m + */ + _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]); + tmp = _mm256_sllv_epi32(ONE, m_vec[i]); + tmp = _mm256_sub_epi32(tmp, ONE); + tmp = _mm256_and_si256(tmp, s_vec[i]); + _mm256_storeu_si256((__m256i*)cwd, tmp); + + for (ui32 j = 0; j < 4; ++j) { + ui32 idx = j * 2; + _cwd = cwd[idx]; + _cwd_len = cwd_len[idx]; + _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len; + _cwd_len += cwd_len[idx + 1]; + ms_encode(msp, _cwd, _cwd_len); + } + } +} + +static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec, + __m256i &e_qmax_vec) +{ + /* if (u_q[i] > 0) { + * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]); + * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1; + * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2; + * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3; + * } + */ + auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO); + 
+ auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec); + auto eps_vec = _mm256_srli_epi32(mask, 31); + + mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec); + auto tmp = _mm256_srli_epi32(mask, 31); + tmp = _mm256_slli_epi32(tmp, 1); + eps_vec = _mm256_or_si256(eps_vec, tmp); + + mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec); + tmp = _mm256_srli_epi32(mask, 31); + tmp = _mm256_slli_epi32(tmp, 2); + eps_vec = _mm256_or_si256(eps_vec, tmp); + + mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec); + tmp = _mm256_srli_epi32(mask, 31); + tmp = _mm256_slli_epi32(tmp, 3); + eps_vec = _mm256_or_si256(eps_vec, tmp); + + return _mm256_and_si256(u_q_mask, eps_vec); +} + +static void update_lep(ui32 x, __m256i &prev_e_val_vec, + __m256i *eq_vec, __m256i *e_val_vec, + const __m256i left_shift) +{ + /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + * lep[0] = (ui8)e_q[3]; + * Compare e_q[1] with e_q[3] of the prevous round. + */ + auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift); + tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0); + prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0); + e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp); +} + + +static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec, + __m256i &rho_vec, __m256i *cx_val_vec, + const __m256i left_shift) +{ + /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + * lcxp[0] = (ui8)((rho[0] & 8) >> 3); + * Or (rho[0] & 2) and (rho[0] of the previous round & 8). 
+ */ + auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift); + tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0); + prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0); + + tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8)); + tmp = _mm256_srli_epi32(tmp, 3); + + auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2)); + tmp1 = _mm256_srli_epi32(tmp1, 1); + cx_val_vec[x] = _mm256_or_si256(tmp, tmp1); +} + +static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec, + __m256i &eps_vec, ui32 *vlc_tbl) +{ + /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */ + auto tmp = _mm256_slli_epi32(cq_vec, 8); + auto tmp1 = _mm256_slli_epi32(rho_vec, 4); + tmp = _mm256_add_epi32(tmp, tmp1); + tmp = _mm256_add_epi32(tmp, eps_vec); + return _mm256_i32gather_epi32((const int *)vlc_tbl, tmp, 4); +} + +static __m256i proc_cq1(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec, + const __m256i right_shift) +{ + ojph_unused(x); + ojph_unused(cx_val_vec); + ojph_unused(right_shift); + + /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */ + auto tmp = _mm256_srli_epi32(rho_vec, 1); + auto tmp1 = _mm256_and_si256(rho_vec, ONE); + return _mm256_or_si256(tmp, tmp1); +} + +static __m256i proc_cq2(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec, + const __m256i right_shift) +{ + // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2)) + // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2)); + auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift); + auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift); + + tmp = _mm256_insert_epi64(tmp, _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3); + tmp = _mm256_slli_epi32(tmp, 2); + auto tmp1 = _mm256_insert_epi32(lcxp1_vec, _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7); + tmp = _mm256_add_epi32(tmp1, tmp); + + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4)); + tmp1 = _mm256_srli_epi32(tmp1, 1); + tmp = 
_mm256_or_si256(tmp, tmp1); + + tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8)); + tmp1 = _mm256_srli_epi32(tmp1, 2); + + return _mm256_or_si256(tmp, tmp1); +} + +using fn_proc_cq = __m256i (*)(ui32, __m256i *, __m256i &, const __m256i); + +static void proc_mel_encode1(mel_struct *melp, __m256i &cq_vec, + __m256i &rho_vec, __m256i u_q_vec, ui32 ignore, + const __m256i right_shift) +{ + int32_t mel_need_encode[8]; + int32_t mel_need_encode2[8]; + int32_t mel_bit[8]; + int32_t mel_bit2[8]; + /* Prepare mel_encode params */ + /* if (c_q[i] == 0) { */ + _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO)); + /* mel_encode(&mel, rho[i] != 0); */ + _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31)); + /* } */ + + /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */ + auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift); + auto tmp1 = _mm256_min_epi32(u_q_vec, tmp); + _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31)); + + /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */ + auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO); + _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO))); + + ui32 i_max = 8 - (ignore / 2); + + for (ui32 i = 0; i < i_max; i += 2) { + if (mel_need_encode[i]) { + mel_encode(melp, mel_bit[i]); + } + + if (i + 1 < i_max) { + if (mel_need_encode[i + 1]) { + mel_encode(melp, mel_bit[i + 1]); + } + } + + if (mel_need_encode2[i]) { + mel_encode(melp, mel_bit2[i]); + } + } +} + +static void proc_mel_encode2(mel_struct *melp, __m256i &cq_vec, + __m256i &rho_vec, __m256i u_q_vec, ui32 ignore, + const __m256i right_shift) +{ + ojph_unused(u_q_vec); + ojph_unused(right_shift); + int32_t mel_need_encode[8]; + int32_t mel_bit[8]; + + /* Prepare mel_encode params */ + /* if (c_q[i] == 0) { */ + _mm256_storeu_si256((__m256i*)mel_need_encode, 
_mm256_cmpeq_epi32(cq_vec, ZERO));
+  /* mel_encode(&mel, rho[i] != 0); */
+  _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
+  /* } */
+
+  ui32 i_max = 8 - (ignore / 2);
+
+  for (ui32 i = 0; i < i_max; ++i) {
+    if (mel_need_encode[i]) {
+      mel_encode(melp, mel_bit[i]);
+    }
+  }
+}
+
+using fn_proc_mel_encode = void (*)(mel_struct *, __m256i &, __m256i &,
+                                    __m256i, ui32, const __m256i);
+
+/* Packs and emits the VLC bits for one vector of eight quads, two quads
+ * (one pair) per vlc_encode call.
+ *
+ * For each pair the word is assembled LSB first from:
+ *   - the codeword of the first quad (value tuple >> 4, length tuple & 7),
+ *   - the codeword of the second quad, unless that quad is padding,
+ *   - the u_q prefix/suffix codewords, whose layout depends on the pair:
+ *       both u_q > 2           : prefixes and suffixes looked up at u_q - 2
+ *       u_q[0] > 2, u_q[1] > 0 : prefix of u_q[0], one bit (u_q[1] - 1),
+ *                                suffix of u_q[0]
+ *       otherwise              : prefixes and suffixes looked up at u_q
+ *
+ * vlcp   VLC encoder state passed on to vlc_encode
+ * tuple  eight table entries, already shifted by the caller
+ * u_q    eight u_q values
+ * ignore the last ignore / 2 quads are padding and are not emitted
+ */
+static void proc_vlc_encode1(vlc_struct *vlcp, ui32 *tuple,
+                             ui32 *u_q, ui32 ignore)
+{
+  const ui32 quad_cnt = 8 - (ignore / 2);
+
+  for (ui32 q = 0; q < quad_cnt; q += 2) {
+    const ui32 uq0 = u_q[q];
+    const ui32 uq1 = u_q[q + 1];
+
+    /* codeword of the first quad, at most 7 bits */
+    ui32 cwd = tuple[q] >> 4;
+    int len = tuple[q] & 7;
+
+    /* codeword of the second quad, at most 7 bits */
+    if (q + 1 < quad_cnt) {
+      cwd |= (tuple[q + 1] >> 4) << len;
+      len += tuple[q + 1] & 7;
+    }
+
+    const bool both_above_2 = (uq0 > 2) && (uq1 > 2);
+
+    if (!both_above_2 && uq0 > 2 && uq1 > 0) {
+      /* prefix of the first quad */
+      cwd |= (ulvc_cwd_pre[uq0]) << len;
+      len += ulvc_cwd_pre_len[uq0];
+
+      /* a single bit for the second quad */
+      cwd |= (uq1 - 1) << len;
+      len += 1;
+
+      /* suffix of the first quad */
+      cwd |= (ulvc_cwd_suf[uq0]) << len;
+      len += ulvc_cwd_suf_len[uq0];
+    }
+    else {
+      /* when both values exceed 2 the tables are indexed with u_q - 2 */
+      const ui32 k0 = both_above_2 ? uq0 - 2 : uq0;
+      const ui32 k1 = both_above_2 ? uq1 - 2 : uq1;
+
+      /* the two prefixes first ... */
+      cwd |= (ulvc_cwd_pre[k0]) << len;
+      len += ulvc_cwd_pre_len[k0];
+      cwd |= (ulvc_cwd_pre[k1]) << len;
+      len += ulvc_cwd_pre_len[k1];
+
+      /* ... then the two suffixes */
+      cwd |= (ulvc_cwd_suf[k0]) << len;
+      len += ulvc_cwd_suf_len[k0];
+      cwd |= (ulvc_cwd_suf[k1]) << len;
+      len += ulvc_cwd_suf_len[k1];
+    }
+
+    vlc_encode(vlcp, cwd, len);
+  }
+}
+
+/* Packs and emits the VLC bits for one vector of eight quads, two quads
+ * (one pair) per vlc_encode call.
+ *
+ * For each pair the word is assembled LSB first from the codeword of the
+ * first quad (value tuple >> 4, length tuple & 7), the codeword of the
+ * second quad unless that quad is padding, the two u_q prefixes and
+ * finally the two u_q suffixes. Unlike proc_vlc_encode1, the prefix and
+ * suffix tables are always indexed with u_q itself.
+ *
+ * vlcp   VLC encoder state passed on to vlc_encode
+ * tuple  eight table entries, already shifted by the caller
+ * u_q    eight u_q values
+ * ignore the last ignore / 2 quads are padding and are not emitted
+ */
+static void proc_vlc_encode2(vlc_struct *vlcp, ui32 *tuple,
+                             ui32 *u_q, ui32 ignore)
+{
+  const ui32 quad_cnt = 8 - (ignore / 2);
+
+  for (ui32 q = 0; q < quad_cnt; q += 2) {
+    const ui32 *pair_tuple = tuple + q;
+    const ui32 uq0 = u_q[q];
+    const ui32 uq1 = u_q[q + 1];
+
+    /* codeword of the first quad, at most 7 bits */
+    ui32 cwd = pair_tuple[0] >> 4;
+    int len = pair_tuple[0] & 7;
+
+    /* codeword of the second quad, at most 7 bits */
+    if (q + 1 < quad_cnt) {
+      cwd |= (pair_tuple[1] >> 4) << len;
+      len += pair_tuple[1] & 7;
+    }
+
+    /* the two prefixes first ... */
+    cwd |= ulvc_cwd_pre[uq0] << len;
+    len += ulvc_cwd_pre_len[uq0];
+    cwd |= (ulvc_cwd_pre[uq1]) << len;
+    len += ulvc_cwd_pre_len[uq1];
+
+    /* ... then the two suffixes */
+    cwd |= (ulvc_cwd_suf[uq0]) << len;
+    len += ulvc_cwd_suf_len[uq0];
+    cwd |= (ulvc_cwd_suf[uq1]) << len;
+    len += ulvc_cwd_suf_len[uq1];
+
+    vlc_encode(vlcp, cwd, len);
+  }
+}
+
+using fn_proc_vlc_encode = void (*)(vlc_struct *, ui32 *, ui32 *, ui32);
+
+void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs,
+                                ui32 num_passes, ui32 _width, ui32 height,
+                                ui32 stride, ui32* lengths,
+                                ojph::mem_elastic_allocator *elastic,
+                                ojph::coded_lists *& coded)
+{
+  ojph_unused(num_passes); //currently not used
+
+  ui32 width = (_width + 15) & ~15u;
+  ui32 ignore = width - _width;
+  const int ms_size = (16384 * 16 + 14) / 15; //more than enough
+  const int mel_vlc_size = 3072; //more than enough
+  const int mel_size = 192;
+  const int vlc_size = mel_vlc_size - mel_size;
+
+  ui8 ms_buf[ms_size];
+  ui8 mel_vlc_buf[mel_vlc_size];
+  ui8 *mel_buf = mel_vlc_buf;
+  ui8 *vlc_buf = mel_vlc_buf + mel_size;
+
+  mel_struct mel;
+  mel_init(&mel, mel_size, mel_buf);
+  vlc_struct vlc;
+  vlc_init(&vlc, vlc_size, vlc_buf);
+  ms_struct ms;
+  ms_init(&ms, ms_size, ms_buf);
+
+  const ui32 p = 30 - missing_msbs;
+
+  //e_val: E values for a line (these are the highest set bit)
+  //cx_val: is the context values
+  //Each byte stores the info for the 2 sample. For E, it is maximum
+  // of the two samples, while for cx, it is the OR of these two samples.
+ //The maximum is between the pixel at the bottom left of one quad + // and the bottom right of the earlier quad. The same is true for cx. + //For a 1024 pixels, we need 512 bytes, the 2 extra, + // one for the non-existing earlier quad, and one for beyond the + // the end + const __m256i right_shift = _mm256_set_epi32( + 0, 7, 6, 5, 4, 3, 2, 1 + ); + + const __m256i left_shift = _mm256_set_epi32( + 6, 5, 4, 3, 2, 1, 0, 7 + ); + + ui32 n_loop = (width + 15) / 16; + + __m256i e_val_vec[65]; + for (ui32 i = 0; i > 3); */ + __m256i tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8)); + cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3); + + prev_e_val_vec = ZERO; + prev_cx_val_vec = ZERO; + + ui32 *sp = buf + y * stride; + + /* 16 bytes per iteration */ + for (ui32 x = 0; x < n_loop; ++x) { + + /* t = sp[i]; */ + if ((x == (n_loop - 1)) && (_width % 16)) { + ui32 tmp_buf[16] = { 0 }; + memcpy(tmp_buf, sp, (_width % 16) * sizeof(ui32)); + src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf)); + src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8)); + if (y + 1 < height) { + memcpy(tmp_buf, sp + stride, (_width % 16) * sizeof(ui32)); + src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf)); + src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8)); + } + else { + src_vec[1] = ZERO; + src_vec[3] = ZERO; + } + } + else { + src_vec[0] = _mm256_loadu_si256((__m256i*)(sp)); + src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8)); + + if (y + 1 < height) { + src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride)); + src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride)); + } + else { + src_vec[1] = ZERO; + src_vec[3] = ZERO; + } + sp += 16; + } + + /* src_vec layout: + * src_vec[0]:[0, 0],[0, 1],[0, 2],[0, 3],[0, 4],[0, 5],.[0, 6],.[0, 7] + * src_vec[1]:[1, 0],[1, 1],[1, 2],[1, 3],[1, 4],[1, 5],.[1, 6],.[1, 7] + * src_vec[2]:[0, 8],[0, 9],[0,10],[0,11],[0,12],[0,13],.[0,14], [0,15] + * src_vec[3]:[1, 8],[1, 9],[1,10],[1,11],[1,12],[1,13],.[1,14], [1,15] + */ + 
__m256i rho_vec, e_qmax_vec; + proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec); + + // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1; + tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift); + tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7); + + auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]); + max_e_vec = _mm256_sub_epi32(max_e_vec, ONE); + + // kappa[i] = (rho[i] & (rho[i] - 1)) ? ojph_max(1, max_e[i]) : 1; + tmp = _mm256_max_epi32(max_e_vec, ONE); + __m256i tmp1 = _mm256_sub_epi32(rho_vec, ONE); + tmp1 = _mm256_and_si256(rho_vec, tmp1); + + auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO); + auto kappa_vec1_ = _mm256_and_si256(cmp, ONE); + auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp); + const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_); + + /* cq[1 - 16] = cq_vec + * cq[0] = prev_cq_vec[0] + */ + tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift); + + auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift); + cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0); + prev_cq = (ui32)_mm256_extract_epi32(tmp, 7); + + update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift); + update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift); + + /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */ + /* u_q[i] = Uq[i] - kappa[i]; */ + auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec); + auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec); + + auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec); + __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl); + ui32 _ignore = ((n_loop - 1) == x) ? 
ignore : 0; + + proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore, + right_shift); + + proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec); + + // vlc_encode(&vlc, tuple[i*2+0] >> 8, (tuple[i*2+0] >> 4) & 7); + // vlc_encode(&vlc, tuple[i*2+1] >> 8, (tuple[i*2+1] >> 4) & 7); + ui32 u_q[8]; + ui32 tuple[8]; + /* The tuple is scaled by 4 due to: + * vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7, true); + * So in the vlc_encode, the tuple will only be scaled by 2. + */ + tuple_vec = _mm256_srli_epi32(tuple_vec, 4); + _mm256_storeu_si256((__m256i*)tuple, tuple_vec); + _mm256_storeu_si256((__m256i*)u_q, u_q_vec); + + proc_vlc_encode(&vlc, tuple, u_q, _ignore); + } + + tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift); + tmp = _mm256_slli_epi32(tmp, 2); + tmp = _mm256_add_epi32(tmp, cx_val_vec[0]); + prev_cq = (ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp)); + + proc_cq = proc_cq2; + vlc_tbl = vlc_tbl1; + proc_mel_encode = proc_mel_encode2; + proc_vlc_encode = proc_vlc_encode2; + } + + ms_terminate(&ms); + terminate_mel_vlc(&mel, &vlc); + + //copy to elastic + lengths[0] = mel.pos + vlc.pos + ms.pos; + elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded); + memcpy(coded->buf, ms.buf, ms.pos); + memcpy(coded->buf + ms.pos, mel.buf, mel.pos); + memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos); + + // put in the interface locator word + ui32 num_bytes = mel.pos + vlc.pos; + coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4); + coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0; + coded->buf[lengths[0]-2] = + (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF)); + + coded->avail_size -= lengths[0]; +} + +} /* namespace local */ +} /* namespace ojph */ From cc8cec43c5f386974a5a4108e5c4e73964f0fe7d Mon Sep 17 00:00:00 2001 From: Tomasz Szumski Date: Mon, 3 Jun 2024 13:06:58 +0200 Subject: [PATCH 03/78] ojph_decode_codeblock_avx2() implementation --- src/core/CMakeLists.txt | 2 + src/core/codestream/ojph_codeblock_fun.cpp 
| 1 + src/core/coding/ojph_block_decoder.h | 6 + src/core/coding/ojph_block_decoder_avx2.cpp | 2041 +++++++++++++++++++ 4 files changed, 2050 insertions(+) create mode 100644 src/core/coding/ojph_block_decoder_avx2.cpp diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 94bcfcc3..1c6856a2 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -86,6 +86,7 @@ else() if (MSVC) set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX") set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") + set_source_files_properties(coding/ojph_block_decoder_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2") set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512") set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX") @@ -97,6 +98,7 @@ else() set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS -mavx) set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) set_source_files_properties(coding/ojph_block_decoder_ssse3.cpp PROPERTIES COMPILE_FLAGS -mssse3) + set_source_files_properties(coding/ojph_block_decoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) set_source_files_properties(coding/ojph_block_encoder_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2) set_source_files_properties(coding/ojph_block_encoder_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx512cd) set_source_files_properties(transform/ojph_colour_avx.cpp PROPERTIES COMPILE_FLAGS -mavx) diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index 732d5263..51253c1b 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -170,6 +170,7 @@ namespace ojph { 
tx_from_cb = avx2_irv_tx_from_cb; } encode_cb = ojph_encode_codeblock_avx2; + decode_cb = ojph_decode_codeblock_avx2; } #endif // !OJPH_DISABLE_AVX2 diff --git a/src/core/coding/ojph_block_decoder.h b/src/core/coding/ojph_block_decoder.h index dcd3220b..ab019617 100644 --- a/src/core/coding/ojph_block_decoder.h +++ b/src/core/coding/ojph_block_decoder.h @@ -60,6 +60,12 @@ namespace ojph { ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal); + // AVX2-accelerated decoder + bool + ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + // WASM SIMD-accelerated decoder bool ojph_decode_codeblock_wasm(ui8* coded_data, ui32* decoded_data, diff --git a/src/core/coding/ojph_block_decoder_avx2.cpp b/src/core/coding/ojph_block_decoder_avx2.cpp new file mode 100644 index 00000000..e7270a75 --- /dev/null +++ b/src/core/coding/ojph_block_decoder_avx2.cpp @@ -0,0 +1,2041 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2022, Aous Naman +// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2022, The University of New South Wales, Australia +// Copyright (c) 2024, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_block_decoder_avx2.cpp +//***************************************************************************/ + +//***************************************************************************/ +/** @file ojph_block_decoder_avx2.cpp + * @brief implements a faster HTJ2K block decoder using avx2 + */ + +#include +#include + +#include +#include +#include "ojph_block_common.h" +#include "ojph_block_decoder.h" +#include "ojph_arch.h" +#include "ojph_message.h" + +#include + +namespace ojph { + namespace local { + + //************************************************************************/ + /** @brief MEL state structure for reading and decoding the MEL bitstream + * + * A number of events is decoded from the MEL bitstream ahead of time + * and stored in run/num_runs. + * Each run represents the number of zero events before a one event. 
+ */ + struct dec_mel_st { + dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false), + k(0), num_runs(0), runs(0) + {} + // data decoding machinery + ui8* data; //!bits > 32) //there are enough bits in the tmp variable + return; // return without reading new data + + ui32 val = 0xFFFFFFFF; // feed in 0xFF if buffer is exhausted + if (melp->size > 4) { // if there is data in the MEL segment + val = *(ui32*)melp->data; // read 32 bits from MEL data + melp->data += 4; // advance pointer + melp->size -= 4; // reduce counter + } + else if (melp->size > 0) + { // 4 or less + int i = 0; + while (melp->size > 1) { + ui32 v = *melp->data++; // read one byte at a time + ui32 m = ~(0xFFu << i); // mask of location + val = (val & m) | (v << i);// put one byte in its correct location + --melp->size; + i += 8; + } + // size equal to 1 + ui32 v = *melp->data++; // the one before the last is different + v |= 0xF; // MEL and VLC segments can overlap + ui32 m = ~(0xFFu << i); + val = (val & m) | (v << i); + --melp->size; + } + + // next we unstuff them before adding them to the buffer + int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if + // the previously read byte requires + // unstuffing + + // data is unstuffed and accumulated in t + // bits has the number of bits in t + ui32 t = val & 0xFF; + bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing + bits -= unstuff; // there is one less bit in t if unstuffing is needed + t = t << (8 - unstuff); // move up to make room for the next byte + + //this is a repeat of the above + t |= (val>>8) & 0xFF; + unstuff = (((val >> 8) & 0xFF) == 0xFF); + bits -= unstuff; + t = t << (8 - unstuff); + + t |= (val>>16) & 0xFF; + unstuff = (((val >> 16) & 0xFF) == 0xFF); + bits -= unstuff; + t = t << (8 - unstuff); + + t |= (val>>24) & 0xFF; + melp->unstuff = (((val >> 24) & 0xFF) == 0xFF); + + // move t to tmp, and push the result all the way up, so we read from + // the MSB + melp->tmp |= ((ui64)t) << 
(64 - bits - melp->bits);
+      melp->bits += bits; //increment the number of bits in tmp
+    }
+
+    //************************************************************************/
+    /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs
+     *
+     *  Runs are stored in "runs" and the number of runs in "num_runs".
+     *  Each run represents a number of zero events that may or may not
+     *  terminate in a 1 event.
+     *  Each run occupies a 7-bit slot. The LSB is 1 if the run terminates
+     *  in a 1 event, 0 otherwise. The stored value is
+     *  (number of zero events << 1) | 1 for a run that terminates in a 1
+     *  event, and ((number of zero events - 1) << 1) for a run that does
+     *  not.
+     *  A total of 6 bits (made up of 1 + 5) should have been enough.
+     *
+     *  Decoding continues while tmp holds at least 6 bits and fewer than
+     *  8 runs are stored.
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_decode(dec_mel_st *melp)
+    {
+      static const int mel_exp[13] = { //MEL exponents
+        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
+      };
+
+      if (melp->bits < 6) // if there are less than 6 bits in tmp
+        mel_read(melp);   // then read from the MEL bitstream
+                          // 6 bits is the largest decodable MEL cwd
+
+      //repeat so long that there is enough decodable bits in tmp,
+      // and the runs store is not full (num_runs < 8)
+      while (melp->bits >= 6 && melp->num_runs < 8)
+      {
+        int eval = mel_exp[melp->k]; // number of bits associated with state
+        int run = 0;
+        if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
+        { //one is found: a full run of (1 << eval) zero events
+          run = 1 << eval;
+          run--; // number of zero events - 1
+          melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
+          melp->tmp <<= 1; // consume one bit from tmp
+          melp->bits -= 1;
+          run = run << 1; // LSB 0: a stretch of zeros not terminating in one
+        }
+        else
+        { //0 is found: the next eval bits hold the number of zero events
+          run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
+          melp->k = melp->k - 1 > 0 ?
melp->k - 1 : 0; //decrement, min is 0
+          melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
+          melp->bits -= eval + 1;
+          run = (run << 1) + 1; // LSB 1: a stretch of zeros terminating with one
+        }
+        eval = melp->num_runs * 7;           // each run has a 7-bit slot
+        melp->runs &= ~((ui64)0x3F << eval); // clear the slot; 6 bits are sufficient
+        melp->runs |= ((ui64)run) << eval;   // store the value in runs
+        melp->num_runs++;                    // increment count
+      }
+    }
+
+    //************************************************************************/
+    /** @brief Initiates a dec_mel_st structure for MEL decoding and reads
+     *         some bytes in order to get the read address to a multiple
+     *         of 4
+     *
+     *  Between 1 and 4 bytes are consumed here, so that the read address
+     *  ends up at a multiple of 4.
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     *  @param [in]  bbuf is a pointer to byte buffer
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  scup is the length of MEL+VLC segments
+     */
+    static inline
+    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
+    {
+      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
+      melp->bits = 0;                  // 0 bits in tmp
+      melp->tmp = 0;                   // empty bit accumulator
+      melp->unstuff = false;           // no unstuffing
+      melp->size = scup - 1;           // size is the length of MEL+VLC-1
+      melp->k = 0;                     // 0 for state
+      melp->num_runs = 0;              // num_runs is 0
+      melp->runs = 0;                  // no runs stored
+
+      //This code is borrowed; original is for a different architecture
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MEL segment
+      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
+      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
+        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
+        ui64 d = (melp->size > 0) ?
*melp->data : 0xFF;//if buffer is consumed + //set data to 0xFF + if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF + // see the standard + melp->data += melp->size-- > 0; //increment if the end is not reached + int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1 + melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp + melp->bits += d_bits; //increment tmp by number of bits + melp->unstuff = ((d & 0xFF) == 0xFF); //true of next byte needs + //unstuffing + } + melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit + // is the MSB + } + + //************************************************************************/ + /** @brief Retrieves one run from dec_mel_st; if there are no runs stored + * MEL segment is decoded + * + * @param [in] melp is a pointer to dec_mel_st structure + */ + static inline + int mel_get_run(dec_mel_st *melp) + { + if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment + mel_decode(melp); + + int t = melp->runs & 0x7F; //retrieve one run + melp->runs >>= 7; // remove the retrieved run + melp->num_runs--; + return t; // return run + } + + //************************************************************************/ + /** @brief A structure for reading and unstuffing a segment that grows + * backward, such as VLC and MRP + */ + struct rev_struct { + rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false) + {} + //storage + ui8* data; //!bits > 32) // if there are more than 32 bits in tmp, then + return; // reading 32 bits can overflow vlcp->tmp + ui32 val = 0; + //the next line (the if statement) needs to be tested first + if (vlcp->size > 3) // if there are more than 3 bytes left in VLC + { + // (vlcp->data - 3) move pointer back to read 32 bits at once + val = *(ui32*)(vlcp->data - 3); // then read 32 bits + vlcp->data -= 4; // move data pointer back by 4 + vlcp->size -= 4; // reduce available byte by 4 + } + else if (vlcp->size > 0) + { // 4 or less + int 
i = 24; + while (vlcp->size > 0) { + ui32 v = *vlcp->data--; // read one byte at a time + val |= (v << i); // put byte in its correct location + --vlcp->size; + i -= 8; + } + } + + __m128i tmp_vec = _mm_set1_epi32((int32_t)val); + tmp_vec = _mm_srlv_epi32(tmp_vec, _mm_setr_epi32(24, 16, 8, 0)); + tmp_vec = _mm_and_si128(tmp_vec, _mm_set1_epi32(0xff)); + + __m128i unstuff_vec = _mm_cmpgt_epi32(tmp_vec, _mm_set1_epi32(0x8F)); + bool unstuff_next = _mm_extract_epi32(unstuff_vec, 3); + unstuff_vec = _mm_slli_si128(unstuff_vec, 4); + unstuff_vec = _mm_insert_epi32(unstuff_vec, vlcp->unstuff * 0xffffffff, 0); + + __m128i val_7f = _mm_set1_epi32(0x7F); + __m128i this_byte_7f = _mm_cmpeq_epi32(_mm_and_si128(tmp_vec, val_7f), val_7f); + unstuff_vec = _mm_and_si128(unstuff_vec, this_byte_7f); + unstuff_vec = _mm_srli_epi32(unstuff_vec, 31); + + __m128i inc_sum = _mm_sub_epi32(_mm_set1_epi32(8), unstuff_vec); + inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4)); + inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8)); + ui32 total_bits = (ui32)_mm_extract_epi32(inc_sum, 3); + + __m128i final_shift = _mm_slli_si128(inc_sum, 4); + tmp_vec = _mm_sllv_epi32(tmp_vec, final_shift); + tmp_vec = _mm_or_si128(tmp_vec, _mm_bsrli_si128(tmp_vec, 8)); + + ui64 tmp = (ui32)_mm_cvtsi128_si32(tmp_vec) | (ui32)_mm_extract_epi32(tmp_vec, 1); + + vlcp->unstuff = unstuff_next; + vlcp->tmp |= tmp << vlcp->bits; + vlcp->bits += total_bits; + } + + //************************************************************************/ + /** @brief Initiates the rev_struct structure and reads a few bytes to + * move the read address to multiple of 4 + * + * There is another similar rev_init_mrp subroutine. The difference is + * that this one, rev_init, discards the first 12 bits (they have the + * sum of the lengths of VLC and MEL segments), and first unstuff depends + * on first 4 bits. 
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     *  @param [in]  data is a pointer to byte at the start of the cleanup pass
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  scup is the length of MEL+VLC segments
+     */
+    static inline
+    void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
+    {
+      //first byte has only the upper 4 bits
+      vlcp->data = data + lcup - 2;
+
+      //size can not be larger than this, in fact it should be smaller
+      vlcp->size = scup - 2;
+
+      ui32 d = *vlcp->data--;            // read one byte (this is a half byte)
+      vlcp->tmp = d >> 4;                // both initialize and set
+      // only 3 bits are kept when the low three bits of tmp are all ones
+      // (check the standard)
+      vlcp->bits = 4 - ((vlcp->tmp & 7) == 7);
+      // the low nibble is treated as all ones for the > 0x8F test
+      vlcp->unstuff = (d | 0xF) > 0x8F;  //this is useful for the next byte
+
+      //This code is designed for an architecture that read address should
+      // align to the read size (address multiple of 4 if read size is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream.
+      // To read 32 bits, read from (vlcp->data - 3)
+      int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
+      int tnum = num < vlcp->size ? num : vlcp->size; // never read past size
+      for (int i = 0; i < tnum; ++i) {
+        ui64 d; // note: shadows the ui32 d declared above
+        d = *vlcp->data--;  // read one byte and move read pointer
+        //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
+        ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ?
1 : 0);
+        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
+        vlcp->bits += d_bits;
+        vlcp->unstuff = d > 0x8F; // for next byte
+      }
+      vlcp->size -= tnum;
+      rev_read(vlcp);  // read another 32 bits
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  If vlcp->tmp holds fewer than 32 bits, rev_read is called, and
+     *  called a second time if the count is still below 32.
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     *  @return      the lowest 32 bits of vlcp->tmp; no bits are consumed
+     */
+    static inline
+    ui32 rev_fetch(rev_struct *vlcp)
+    {
+      if (vlcp->bits < 32)  // if there are fewer than 32 bits, read more
+      {
+        rev_read(vlcp);     // read 32 bits, but unstuffing might reduce this
+        if (vlcp->bits < 32)// if there are still fewer than 32 bits
+          rev_read(vlcp);   // read another 32
+      }
+      return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     *  @param [in]  num_bits is the number of bits to be removed
+     *  @return      the lowest 32 bits of vlcp->tmp after the removal
+     */
+    static inline
+    ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
+    {
+      assert(num_bits <= vlcp->bits); // vlcp->tmp must hold at least num_bits
+      vlcp->tmp >>= num_bits;         // remove bits
+      vlcp->bits -= num_bits;         // decrement the number of bits
+      return (ui32)vlcp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs from rev_struct
+     *
+     *  This is different than rev_read in that this fills in zeros when
+     *  the available data is consumed. The other does not care about the
+     *  values when all data is consumed.
+     *
+     *  See rev_read for more information about unstuffing
+     *
+     *  Nothing is read when mrp->tmp already holds more than 32 bits.
+     *  Otherwise four bytes are processed, starting with the byte at
+     *  mrp->data; bytes past the available data are taken as zero.
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read_mrp(rev_struct *mrp)
+    {
+      //process 4 bytes at a time
+      if (mrp->bits > 32)
+        return;
+      ui32 val = 0;
+      if (mrp->size > 3) // If there are 4 bytes or more
+      { // (mrp->data - 3) move pointer back to read 32 bits at once
+        val = *(ui32*)(mrp->data - 3); // read 32 bits
+        mrp->data -= 4;                // move back pointer
+        mrp->size -= 4;                // reduce count
+      }
+      else if (mrp->size > 0)
+      { // 1 to 3 bytes left; the bytes that are missing stay zero
+        int i = 24;
+        while (mrp->size > 0) {
+          ui32 v = *mrp->data--; // read one byte at a time
+          val |= (v << i);       // put byte in its correct location
+          --mrp->size;
+          i -= 8;
+        }
+      }
+
+      //accumulate in tmp, and keep count in bits
+      // the top byte of val is handled first
+      ui32 bits, tmp = val >> 24;
+
+      //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
+      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
+      bool unstuff = (val >> 24) > 0x8F;
+
+      //process the next byte
+      tmp |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 16) & 0xFF) > 0x8F;
+
+      tmp |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 8) & 0xFF) > 0x8F;
+
+      tmp |= (val & 0xFF) << bits;
+      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = (val & 0xFF) > 0x8F;
+
+      mrp->tmp |= (ui64)tmp << mrp->bits; // append above the bits already held
+      mrp->bits += bits;
+      mrp->unstuff = unstuff;             // next byte
+    }
+
+    //************************************************************************/
+    /** @brief Initializes rev_struct structure for MRP segment, and reads
+     *         a number of bytes such that the next 32 bits read are from
+     *         an address that is a multiple of 4. Note this is designed for
+     *         an architecture that read size must be compatible with the
+     *         alignment of the read address
+     *
+     *  There is another similar subroutine rev_init.
This subroutine does
+     *  NOT skip the first 12 bits, and starts with unstuff set to true.
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     *  @param [in]  data is a pointer to byte at the start of the cleanup pass
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  len2 is the length of SPP+MRP segments
+     */
+    static inline
+    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
+    {
+      mrp->data = data + lcup + len2 - 1; // last byte of the len2 bytes that follow lcup
+      mrp->size = len2;
+      mrp->unstuff = true;
+      mrp->bits = 0;
+      mrp->tmp = 0;
+
+      //This code is designed for an architecture that read address should
+      // align to the read size (address multiple of 4 if read size is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MRP stream
+      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
+      for (int i = 0; i < num; ++i) {
+        ui64 d;
+        //read a byte, 0 if no more data
+        // (size is decremented even after it has reached zero)
+        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
+        //check if unstuffing is needed
+        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ?
1 : 0); + mrp->tmp |= d << mrp->bits; // move data to vlcp->tmp + mrp->bits += d_bits; + mrp->unstuff = d > 0x8F; // for next byte + } + rev_read_mrp(mrp); + } + + //************************************************************************/ + /** @brief Retrieves 32 bits from the head of a rev_struct structure + * + * By the end of this call, mrp->tmp must have no less than 33 bits + * + * @param [in] mrp is a pointer to rev_struct structure + */ + static inline + ui32 rev_fetch_mrp(rev_struct *mrp) + { + if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp + { + rev_read_mrp(mrp); // read 30-32 bits from mrp + if (mrp->bits < 32) // if there is a space of 32 bits + rev_read_mrp(mrp); // read more + } + return (ui32)mrp->tmp; // return the head of mrp->tmp + } + + //************************************************************************/ + /** @brief Consumes num_bits from a rev_struct structure + * + * @param [in] mrp is a pointer to rev_struct structure + * @param [in] num_bits is the number of bits to be removed + */ + inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits) + { + assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits + mrp->tmp >>= num_bits; // discard the lowest num_bits bits + mrp->bits -= num_bits; + return (ui32)mrp->tmp; // return data after consumption + } + + //************************************************************************/ + /** @brief State structure for reading and unstuffing of forward-growing + * bitstreams; these are: MagSgn and SPP bitstreams + */ + struct frwd_struct { + const ui8* data; //! + static inline + void frwd_read(frwd_struct *msp) + { + assert(msp->bits <= 128); + + __m128i offset, val, validity, all_xff; + val = _mm_loadu_si128((__m128i*)msp->data); + int bytes = msp->size >= 16 ? 
16 : msp->size; + validity = _mm_set1_epi8((char)bytes); + msp->data += bytes; + msp->size -= bytes; + int bits = 128; + offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908,0x0706050403020100); + validity = _mm_cmpgt_epi8(validity, offset); + all_xff = _mm_set1_epi8(-1); + if (X == 0xFF) // the compiler should remove this if statement + { + __m128i t = _mm_xor_si128(validity, all_xff); // complement + val = _mm_or_si128(t, val); // fill with 0xFF + } + else if (X == 0) + val = _mm_and_si128(validity, val); // fill with zeros + else + assert(0); + + __m128i ff_bytes; + ff_bytes = _mm_cmpeq_epi8(val, all_xff); + ff_bytes = _mm_and_si128(ff_bytes, validity); + ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes); + flags <<= 1; // unstuff following byte + ui32 next_unstuff = flags >> 16; + flags |= msp->unstuff; + flags &= 0xFFFF; + while (flags) + { // bit unstuffing occurs on average once every 256 bytes + // therefore it is not an issue if it is a bit slow + // here we process 16 bytes + --bits; // consuming one stuffing bit + + ui32 loc = 31 - count_leading_zeros(flags); + flags ^= 1 << loc; + + __m128i m, t, c; + t = _mm_set1_epi8((char)loc); + m = _mm_cmpgt_epi8(offset, t); + + t = _mm_and_si128(m, val); // keep bits at locations larger than loc + c = _mm_srli_epi64(t, 1); // 1 bits left + t = _mm_srli_si128(t, 8); // 8 bytes left + t = _mm_slli_epi64(t, 63); // keep the MSB only + t = _mm_or_si128(t, c); // combine the above 3 steps + + val = _mm_or_si128(t, _mm_andnot_si128(m, val)); + } + + // combine with earlier data + assert(msp->bits >= 0 && msp->bits <= 128); + int cur_bytes = msp->bits >> 3; + int cur_bits = msp->bits & 7; + __m128i b1, b2; + b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits)); + b2 = _mm_slli_si128(val, 8); // 8 bytes right + b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64-cur_bits)); + b1 = _mm_or_si128(b1, b2); + b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes)); + b2 = _mm_or_si128(b1, b2); + _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2); + 
+ int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits; + cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3; // round up + int upper = _mm_extract_epi16(val, 7); + upper >>= consumed_bits - 128 + 16; + msp->tmp[cur_bytes] = (ui8)upper; // copy byte + + msp->bits += (ui32)bits; + msp->unstuff = next_unstuff; // next unstuff + assert(msp->unstuff == 0 || msp->unstuff == 1); + } + + //************************************************************************/ + /** @brief Initialize frwd_struct struct and reads some bytes + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + * @param [in] data is a pointer to the start of data + * @param [in] size is the number of byte in the bitstream + */ + template + static inline + void frwd_init(frwd_struct *msp, const ui8* data, int size) + { + msp->data = data; + _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128()); + _mm_storeu_si128((__m128i *)msp->tmp + 1, _mm_setzero_si128()); + _mm_storeu_si128((__m128i *)msp->tmp + 2, _mm_setzero_si128()); + + msp->bits = 0; + msp->unstuff = 0; + msp->size = size; + + frwd_read(msp); // read 128 bits more + } + + //************************************************************************/ + /** @brief Consume num_bits bits from the bitstream of frwd_struct + * + * @param [in] msp is a pointer to frwd_struct + * @param [in] num_bits is the number of bit to consume + */ + static inline + void frwd_advance(frwd_struct *msp, ui32 num_bits) + { + assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128); + msp->bits -= num_bits; + + __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18)); + num_bits &= 63; + + __m128i v0, v1, c0, c1, t; + v0 = _mm_loadu_si128(p); + v1 = _mm_loadu_si128(p + 1); + + // shift right by num_bits + c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits)); + t = _mm_srli_si128(v0, 8); + t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - 
num_bits)); + c0 = _mm_or_si128(c0, t); + t = _mm_slli_si128(v1, 8); + t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits)); + c0 = _mm_or_si128(c0, t); + + _mm_storeu_si128((__m128i*)msp->tmp, c0); + + c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits)); + t = _mm_srli_si128(v1, 8); + t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits)); + c1 = _mm_or_si128(c1, t); + + _mm_storeu_si128((__m128i*)msp->tmp + 1, c1); + } + + //************************************************************************/ + /** @brief Fetches 32 bits from the frwd_struct bitstream + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + */ + template + static inline + __m128i frwd_fetch(frwd_struct *msp) + { + if (msp->bits <= 128) + { + frwd_read(msp); + if (msp->bits <= 128) //need to test + frwd_read(msp); + } + __m128i t = _mm_loadu_si128((__m128i*)msp->tmp); + return t; + } + + //************************************************************************/ + /** @brief decodes twos consecutive quads (one octet), using 32 bit data + * + * @param inf_u_q decoded VLC code, with interleaved u values + * @param U_q U values + * @param magsgn structure for forward data buffer + * @param p bitplane at which we are decoding + * @param vn used for handling E values (stores v_n values) + * @return __m256i decoded two quads + */ + static inline __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q, frwd_struct* magsgn, ui32 p, __m128i& vn) { + __m256i row = _mm256_setzero_si256(); + + // we keeps e_k, e_1, and rho in w2 + __m256i flags = _mm256_and_si256(inf_u_q, _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110, 0x8880, 0x4440, 0x2220, 0x1110)); + __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256()); + + if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant? 
+ { + flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8)); + + // U_q holds U_q for this quad + // flags has e_k, e_1, and rho such that e_k is sitting in the + // 0x8000, e_1 in 0x800, and rho in 0x80 + + // next e_k and m_n + __m256i m_n; + __m256i w0 = _mm256_srli_epi32(flags, 15); // e_k + m_n = _mm256_sub_epi32(U_q, w0); + m_n = _mm256_andnot_si256(insig, m_n); + + // find cumulative sums + // to find at which bit in ms_vec the sample starts + __m256i inc_sum = m_n; // inclusive scan + inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4)); + inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8)); + int total_mn1 = _mm256_extract_epi16(inc_sum, 6); + int total_mn2 = _mm256_extract_epi16(inc_sum, 14); + + __m128i ms_vec0 = _mm_setzero_si128(); + __m128i ms_vec1 = _mm_setzero_si128(); + if (total_mn1) { + ms_vec0 = frwd_fetch<0xFF>(magsgn); + frwd_advance(magsgn, (ui32)total_mn1); + } + if (total_mn2) { + ms_vec1 = frwd_fetch<0xFF>(magsgn); + frwd_advance(magsgn, (ui32)total_mn2); + } + + __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1); + + __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4); // exclusive scan + + // find the starting byte and starting bit + __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3); + __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7)); + byte_idx = _mm256_shuffle_epi8(byte_idx, + _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000)); + byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100)); + __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx); + byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101)); + __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx); + + // shift samples values to correct location + bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16)); + + __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 
1, 3, 7, 15, 31, 63, 127, -1); + __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1); + + __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx); + bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101)); + d0 = _mm256_mullo_epi16(d0, bit_shift); + d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB + d1 = _mm256_mullo_epi16(d1, bit_shift); + d1 = _mm256_and_si256(d1, _mm256_set1_epi32((si32)0xFF00FF00)); // 8 in MSB + d0 = _mm256_or_si256(d0, d1); + + // find location of e_k and mask + __m256i shift; + __m256i ones = _mm256_set1_epi32(1); + __m256i twos = _mm256_set1_epi32(2); + __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones); + U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F)); + U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0); + w0 = _mm256_sub_epi32(twos, w0); + shift = _mm256_sllv_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31 + ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones)); + + // next e_1 + w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800)); + w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256()); + w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position + ms_vec = _mm256_or_si256(ms_vec, w0); // e_1 + w0 = _mm256_slli_epi32(ms_vec, 31); // sign + ms_vec = _mm256_or_si256(ms_vec, ones); // bin center + __m256i tvn = ms_vec; + ms_vec = _mm256_add_epi32(ms_vec, twos);// + 2 + ms_vec = _mm256_slli_epi32(ms_vec, (si32)p - 1); + ms_vec = _mm256_or_si256(ms_vec, w0); // sign + row = _mm256_andnot_si256(insig, ms_vec); // significant only + + ms_vec = _mm256_andnot_si256(insig, tvn); // significant only + + tvn = _mm256_shuffle_epi8(ms_vec, _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1, -1, -1, 0x0F0E0D0C, 0x07060504)); + + vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn)); + vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1)); + } + return row; + } + + + //************************************************************************/ + /** @brief decodes twos 
consecutive quads (one octet), using 16 bit data + * + * @param inf_u_q decoded VLC code, with interleaved u values + * @param U_q U values + * @param magsgn structure for forward data buffer + * @param p bitplane at which we are decoding + * @param vn used for handling E values (stores v_n values) + * @return __m128i decoded quad + */ + + static inline __m256i decode_four_quad16(const __m128i inf_u_q, __m128i U_q, frwd_struct* magsgn, ui32 p, __m128i& vn) { + + __m256i w0; // workers + __m256i insig; // lanes hold FF's if samples are insignificant + __m256i flags; // lanes hold e_k, e_1, and rho + + __m256i row = _mm256_setzero_si256(); + __m128i ddd = _mm_shuffle_epi8(inf_u_q, + _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x908, 0x0504, 0x0504, 0x0100, 0x0100)); + w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd), + _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3)); + // we keeps e_k, e_1, and rho in w2 + flags = _mm256_and_si256(w0, + _mm256_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110, + (si16)0x8880, 0x4440, 0x2220, 0x1110, + (si16)0x8880, 0x4440, 0x2220, 0x1110, + (si16)0x8880, 0x4440, 0x2220, 0x1110)); + insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256()); + if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant? 
+ { + ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q); + __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd), + _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3)); + flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8)); + + // U_q holds U_q for this quad + // flags has e_k, e_1, and rho such that e_k is sitting in the + // 0x8000, e_1 in 0x800, and rho in 0x80 + + // next e_k and m_n + __m256i m_n; + w0 = _mm256_srli_epi16(flags, 15); // e_k + m_n = _mm256_sub_epi16(U_q_avx, w0); + m_n = _mm256_andnot_si256(insig, m_n); + + // find cumulative sums + // to find at which bit in ms_vec the sample starts + __m256i inc_sum = m_n; // inclusive scan + inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2)); + inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4)); + inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8)); + int total_mn1 = _mm256_extract_epi16(inc_sum, 7); + int total_mn2 = _mm256_extract_epi16(inc_sum, 15); + __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2); // exclusive scan + + __m128i ms_vec0 = _mm_setzero_si128(); + __m128i ms_vec1 = _mm_setzero_si128(); + if (total_mn1) { + ms_vec0 = frwd_fetch<0xFF>(magsgn); + frwd_advance(magsgn, (ui32)total_mn1); + } + if (total_mn2) { + ms_vec1 = frwd_fetch<0xFF>(magsgn); + frwd_advance(magsgn, (ui32)total_mn2); + } + + __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1); + + // find the starting byte and starting bit + __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3); + __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7)); + byte_idx = _mm256_shuffle_epi8(byte_idx, + _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808, + 0x0606, 0x0404, 0x0202, 0x0000, 0x0E0E, 0x0C0C, 0x0A0A, 0x0808, + 0x0606, 0x0404, 0x0202, 0x0000)); + byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100)); + __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx); + byte_idx = 
_mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101)); + __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx); + + // shift samples values to correct location + __m256i bit_shift = _mm256_shuffle_epi8( + _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, + 1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1, + 1, 3, 7, 15, 31, 63, 127, -1), bit_idx); + bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101)); + d0 = _mm256_mullo_epi16(d0, bit_shift); + d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB + d1 = _mm256_mullo_epi16(d1, bit_shift); + d1 = _mm256_and_si256(d1, _mm256_set1_epi16((si16)0xFF00)); // 8 in MSB + d0 = _mm256_or_si256(d0, d1); + + // find location of e_k and mask + __m256i shift, t0, t1, Uq0, Uq1; + __m256i ones = _mm256_set1_epi16(1); + __m256i twos = _mm256_set1_epi16(2); + __m256i U_q_m1 = _mm256_sub_epi32(U_q_avx, ones); + Uq0 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F)); + Uq1 = _mm256_bsrli_epi128(U_q_m1, 14); + w0 = _mm256_sub_epi16(twos, w0); + t0 = _mm256_and_si256(w0, _mm256_set_epi64x(0, -1, 0, -1)); + t1 = _mm256_and_si256(w0, _mm256_set_epi64x(-1, 0, -1, 0)); + {//no _mm256_sllv_epi16 in avx2 + __m128i t_0_sse = _mm256_castsi256_si128(t0); + t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq0)); + __m128i t_1_sse = _mm256_extracti128_si256(t0 , 0x1); + t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq0, 0x1)); + t0 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1); + + t_0_sse = _mm256_castsi256_si128(t1); + t_0_sse = _mm_sll_epi16(t_0_sse, _mm256_castsi256_si128(Uq1)); + t_1_sse = _mm256_extracti128_si256(t1, 0x1); + t_1_sse = _mm_sll_epi16(t_1_sse, _mm256_extracti128_si256(Uq1, 0x1)); + t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(t_0_sse), t_1_sse, 0x1); + } + shift = _mm256_or_si256(t0, t1); + ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones)); + + // next e_1 + w0 = _mm256_and_si256(flags, 
_mm256_set1_epi16(0x800)); + w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256()); + w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position + ms_vec = _mm256_or_si256(ms_vec, w0); // e_1 + w0 = _mm256_slli_epi16(ms_vec, 15); // sign + ms_vec = _mm256_or_si256(ms_vec, ones); // bin center + __m256i tvn = ms_vec; + ms_vec = _mm256_add_epi16(ms_vec, twos);// + 2 + ms_vec = _mm256_slli_epi16(ms_vec, (si32)p - 1); + ms_vec = _mm256_or_si256(ms_vec, w0); // sign + row = _mm256_andnot_si256(insig, ms_vec); // significant only + + ms_vec = _mm256_andnot_si256(insig, tvn); // significant only + + __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec, + _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1, + -1, -1, -1, -1, -1, -1, 0x0706, 0x0302)); + __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec, + _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1, + -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1)); + ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2); + + vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec)); + vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1)); + } + return row; + } + + // https://stackoverflow.com/a/58827596 + inline __m256i avx2_lzcnt_epi32(__m256i v) { + // prevent value from being rounded up to the next power of two + v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep 8 MSB + + v = _mm256_castps_si256(_mm256_cvtepi32_ps(v)); // convert an integer to float + v = _mm256_srli_epi32(v, 23); // shift down the exponent + v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias + v = _mm256_min_epi16(v, _mm256_set1_epi32(32)); // clamp at 32 + + return v; + } + + //************************************************************************/ + /** @brief Decodes one codeblock, processing the cleanup, siginificance + * propagation, and magnitude refinement pass + * + * @param [in] coded_data is a pointer to bitstream + * @param [in] decoded_data is a pointer to decoded codeblock data buf. 
+ * @param [in] missing_msbs is the number of missing MSBs + * @param [in] num_passes is the number of passes: 1 if CUP only, + * 2 for CUP+SPP, and 3 for CUP+SPP+MRP + * @param [in] lengths1 is the length of cleanup pass + * @param [in] lengths2 is the length of refinement passes (either SPP + * only or SPP+MRP) + * @param [in] width is the decoded codeblock width + * @param [in] height is the decoded codeblock height + * @param [in] stride is the decoded codeblock buffer stride + * @param [in] stripe_causal is true for stripe causal mode + */ + bool ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, + bool stripe_causal) + { + static bool insufficient_precision = false; + static bool modify_code = false; + static bool truncate_spp_mrp = false; + + if (num_passes > 1 && lengths2 == 0) + { + OJPH_WARN(0x00010001, "A malformed codeblock that has more than " + "one coding pass, but zero length for " + "2nd and potential 3rd pass"); + num_passes = 1; + } + + if (num_passes > 3) + { + OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " + "This codeblocks has %d passes", + num_passes); + return false; + } + + if (missing_msbs > 30) // p < 0 + { + if (insufficient_precision == false) + { + insufficient_precision = true; + OJPH_WARN(0x00010003, "32 bits are not enough to decode this " + "codeblock. This message will not be " + "displayed again"); + } + return false; + } + else if (missing_msbs == 30) // p == 0 + { // not enough precision to decode and set the bin center to 1 + if (modify_code == false) { + modify_code = true; + OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " + "pass. The code can be modified to support " + "this case. 
This message will not be " + "displayed again"); + } + return false; // 32 bits are not enough to decode this + } + else if (missing_msbs == 29) // if p is 1, then num_passes must be 1 + { + if (num_passes > 1) { + num_passes = 1; + if (truncate_spp_mrp == false) { + truncate_spp_mrp = true; + OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " + "nor MagRef passes; both will be skipped. " + "This message will not be displayed " + "again"); + } + } + } + ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP + // There is a way to handle the case of p == 0, but a different path + // is required + + if (lengths1 < 2) + { + OJPH_WARN(0x00010006, "Wrong codeblock length"); + return false; + } + + // read scup and fix the bytes there + int lcup, scup; + lcup = (int)lengths1; // length of CUP + //scup is the length of MEL + VLC + scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF); + if (scup < 2 || scup > lcup || scup > 4079) //something is wrong + return false; + + // The temporary storage scratch holds two types of data in an + // interleaved fashion. The interleaving allows us to use one + // memory pointer. + // We have one entry for a decoded VLC code, and one entry for UVLC. + // Entries are 16 bits each, corresponding to one quad, + // but since we want to use XMM registers of the SSE family + // of SIMD; we allocated 16 bytes or more per quad row; that is, + // the width is no smaller than 16 bytes (or 8 entries), and the + // height is 512 quads + // Each VLC entry contains, in the following order, starting + // from MSB + // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits) + // Each entry in UVLC contains u_q + // One extra row to handle the case of SPP propagating downwards + // when codeblock width is 4 + ui16 scratch[8 * 513] = {0}; // 8+ kB + + // We need an extra two entries (one inf and one u_q) beyond + // the last column. 
+ // If the block width is 4 (2 quads), then we use sstr of 8 + // (enough for 4 quads). If width is 8 (4 quads) we use + // sstr is 16 (enough for 8 quads). For a width of 16 (8 + // quads), we use 24 (enough for 12 quads). + ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8 + + assert((stride & 0x3) == 0); + + ui32 mmsbp2 = missing_msbs + 2; + + // The cleanup pass is decoded in two steps; in step one, + // the VLC and MEL segments are decoded, generating a record that + // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k. + // This information should be sufficient for the next step. + // In step 2, we decode the MagSgn segment. + + // step 1 decoding VLC and MEL segments + { + // init structures + dec_mel_st mel; + mel_init(&mel, coded_data, lcup, scup); + rev_struct vlc; + rev_init(&vlc, coded_data, lcup, scup); + + int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm + // data represented as runs of 0 events + // See mel_decode description + + ui32 vlc_val; + ui32 c_q = 0; + ui16 *sp = scratch; + //initial quad row + for (ui32 x = 0; x < width; sp += 4) + { + // decode VLC + ///////////// + + // first quad + vlc_val = rev_fetch(&vlc); + + //decode VLC using the context c_q and the head of VLC bitstream + ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ]; + + // if context is zero, use one MEL event + if (c_q == 0) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // Is the run terminated in 1? if so, use decoded VLC code, + // otherwise, discard decoded data, since we will decoded again + // using a different context + t0 = (run == -1) ? t0 : 0; + + // is run -1 or -2? this means a run has been consumed + if (run < 0) + run = mel_get_run(&mel); // get another run + } + //run -= (c_q == 0) ? 2 : 0; + //t0 = (c_q != 0 || run == -1) ? t0 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[0] = t0; + x += 2; + + // prepare context for the next quad; eqn. 
1 in ITU T.814 + c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2); + + //remove data from vlc stream (0 bits are removed if vlc is not used) + vlc_val = rev_advance(&vlc, t0 & 0x7); + + //second quad + ui16 t1 = 0; + + //decode VLC using the context c_q and the head of VLC bitstream + t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)]; + + // if context is zero, use one MEL event + if (c_q == 0 && x < width) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // if event is 0, discard decoded t1 + t1 = (run == -1) ? t1 : 0; + + if (run < 0) // have we consumed all events in a run + run = mel_get_run(&mel); // if yes, then get another run + } + t1 = x < width ? t1 : 0; + //run -= (c_q == 0 && x < width) ? 2 : 0; + //t1 = (c_q != 0 || run == -1) ? t1 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[2] = t1; + x += 2; + + //prepare context for the next quad, eqn. 1 in ITU T.814 + c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2); + + //remove data from vlc stream, if qinf is not used, cwdlen is 0 + vlc_val = rev_advance(&vlc, t1 & 0x7); + + // decode u + ///////////// + // uvlc_mode is made up of u_offset bits from the quad pair + ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4); + if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from + { // the MEL run of events + run -= 2; //subtract 2, since events number if multiplied by 2 + + uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by + // is 0x40 + + if (run < 0)//if run is consumed (run is -1 or -2), get another run + run = mel_get_run(&mel); + } + //run -= (uvlc_mode == 0xc0) ? 2 : 0; + //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 
0x40 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + + //decode uvlc_mode to get u for both quads + ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)]; + //remove total prefix length + vlc_val = rev_advance(&vlc, uvlc_entry & 0x7); + uvlc_entry >>= 3; + //extract suffixes for quad 0 and 1 + ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads + ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads + vlc_val = rev_advance(&vlc, len); + ojph_unused(vlc_val); //static code analysis: unused value + uvlc_entry >>= 4; + // quad 0 length + len = uvlc_entry & 0x7; // quad 0 suffix length + uvlc_entry >>= 3; + ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<> 3) + (tmp >> len)); //kappa == 1 + sp[3] = u_q; + } + sp[0] = sp[1] = 0; + + //non initial quad rows + for (ui32 y = 2; y < height; y += 2) + { + c_q = 0; // context + ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads + + for (ui32 x = 0; x < width; sp += 4) + { + // decode VLC + ///////////// + + // sigma_q (n, ne, nf) + c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2); + c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4); + + // first quad + vlc_val = rev_fetch(&vlc); + + //decode VLC using the context c_q and the head of VLC bitstream + ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ]; + + // if context is zero, use one MEL event + if (c_q == 0) //zero context + { + run -= 2; //subtract 2, since events number is multiplied by 2 + + // Is the run terminated in 1? if so, use decoded VLC code, + // otherwise, discard decoded data, since we will decoded again + // using a different context + t0 = (run == -1) ? t0 : 0; + + // is run -1 or -2? this means a run has been consumed + if (run < 0) + run = mel_get_run(&mel); // get another run + } + //run -= (c_q == 0) ? 2 : 0; + //t0 = (c_q != 0 || run == -1) ? t0 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[0] = t0; + x += 2; + + // prepare context for the next quad; eqn. 
2 in ITU T.814 + // sigma_q (w, sw) + c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1); + // sigma_q (nw) + c_q |= sp[0 - (si32)sstr] & 0x80; + // sigma_q (n, ne, nf) + c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2); + c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4); + + //remove data from vlc stream (0 bits are removed if vlc is unused) + vlc_val = rev_advance(&vlc, t0 & 0x7); + + //second quad + ui16 t1 = 0; + + //decode VLC using the context c_q and the head of VLC bitstream + t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)]; + + // if context is zero, use one MEL event + if (c_q == 0 && x < width) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // if event is 0, discard decoded t1 + t1 = (run == -1) ? t1 : 0; + + if (run < 0) // have we consumed all events in a run + run = mel_get_run(&mel); // if yes, then get another run + } + t1 = x < width ? t1 : 0; + //run -= (c_q == 0 && x < width) ? 2 : 0; + //t1 = (c_q != 0 || run == -1) ? t1 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[2] = t1; + x += 2; + + // partial c_q, will be completed when we process the next quad + // sigma_q (w, sw) + c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1); + // sigma_q (nw) + c_q |= sp[2 - (si32)sstr] & 0x80; + + //remove data from vlc stream, if qinf is not used, cwdlen is 0 + vlc_val = rev_advance(&vlc, t1 & 0x7); + + // decode u + ///////////// + // uvlc_mode is made up of u_offset bits from the quad pair + ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4); + ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)]; + //remove total prefix length + vlc_val = rev_advance(&vlc, uvlc_entry & 0x7); + uvlc_entry >>= 3; + //extract suffixes for quad 0 and 1 + ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads + ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads + vlc_val = rev_advance(&vlc, len); + ojph_unused(vlc_val); //static code analysis: unused value + uvlc_entry >>= 4; + // quad 0 length + len = 
uvlc_entry & 0x7; // quad 0 suffix length + uvlc_entry >>= 3; + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q + sp[1] = u_q; + u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q + sp[3] = u_q; + } + sp[0] = sp[1] = 0; + } + } + + // step2 we decode magsgn + // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit) + // The 32 bit path decode 16 bits data, for which one would think + // 16 bits are enough, because we want to put in the center of the + // bin. + // If you have mmsbp2 equals 16 bit, and reversible coding, and + // no bitplanes are missing, then we can decoding using the 16 bit + // path, but we are not doing this here. + if (mmsbp2 >= 16) + { + // We allocate a scratch row for storing v_n values. + // We have 512 quads horizontally. + // We may go beyond the last entry by up to 4 entries. + // Here we allocate additional 8 entries. + // There are two rows in this structure, the bottom + // row is used to store processed entries. + const int v_n_size = 512 + 16; + ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB + + frwd_struct magsgn; + frwd_init<0xFF>(&magsgn, coded_data, lcup - scup); + + const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2); + + { + ui16 *sp = scratch; + ui32 *vp = v_n_scratch; + ui32 *dp = decoded_data; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) + { + __m128i vn = _mm_set1_epi32(2); + + __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp)); + inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + + __m256i U_q = _mm256_srli_epi32(inf_u_q, 16); + __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2); + if (!_mm256_testz_si256(w, w)) { + return false; + } + + __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn); + row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row)); + 
_mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1)); + + __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp); + w0 = _mm_or_si128(w0, vn); + _mm_storeu_si128((__m128i*)vp, w0); + } + } + + for (ui32 y = 2; y < height; y += 2) + { + { + // perform 31 - count_leading_zeros(*vp) here + ui32 *vp = v_n_scratch; + ui16* sp = scratch + (y >> 1) * sstr; + + const __m256i avx_31 = _mm256_set1_epi32(31); + const __m256i avx_f0 = _mm256_set1_epi32(0xF0); + const __m256i avx_1 = _mm256_set1_epi32(1); + const __m256i avx_0 = _mm256_setzero_si256(); + + for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) { + __m256i v = _mm256_loadu_si256((__m256i*)vp); + __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1)); + v = _mm256_or_si256(v, v_p1); + v = avx2_lzcnt_epi32(v); + v = _mm256_sub_epi32(avx_31, v); + + __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp); + __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0); + __m256i w0 = _mm256_sub_epi32(gamma, avx_1); + gamma = _mm256_and_si256(gamma, w0); + gamma = _mm256_cmpeq_epi32(gamma, avx_0); + + v = _mm256_andnot_si256(gamma, v); + v = _mm256_max_epi32(v, avx_1); + + inf_u_q = _mm256_srli_epi32(inf_u_q, 16); + v = _mm256_add_epi32(inf_u_q, v); + + w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2); + if (!_mm256_testz_si256(w0, w0)) { + return false; + } + + _mm256_storeu_si256((__m256i*)(vp + v_n_size), v); + } + } + + ui32 *vp = v_n_scratch; + ui16 *sp = scratch + (y >> 1) * sstr; + ui32 *dp = decoded_data + y * stride; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) { + //process two quads + __m128i vn = _mm_set1_epi32(2); + + __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp)); + inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1)); + + __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size))); + U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 
1)); + + __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, &magsgn, p, vn); + row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row)); + _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1)); + + __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp); + w0 = _mm_or_si128(w0, vn); + _mm_storeu_si128((__m128i*)vp, w0); + } + } + } + else { + + // reduce bitplane by 16 because we now have 16 bits instead of 32 + p -= 16; + + // We allocate a scratch row for storing v_n values. + // We have 512 quads horizontally. + // We may go beyond the last entry by up to 8 entries. + // Therefore we allocate additional 8 entries. + // There are two rows in this structure, the bottom + // row is used to store processed entries. + const int v_n_size = 512 + 16; + ui16 v_n_scratch[v_n_size] = {0}; // 1+ kB + ui32 v_n_scratch_32[v_n_size] = {0}; // 2+ kB + + frwd_struct magsgn; + frwd_init<0xFF>(&magsgn, coded_data, lcup - scup); + + { + ui16 *sp = scratch; + ui16 *vp = v_n_scratch; + ui32 *dp = decoded_data; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) { + ////process four quads + __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp); + __m128i U_q = _mm_srli_epi32(inf_u_q, 16); + __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2)); + if (!_mm_testz_si128(w, w)) { + return false; + } + + __m128i vn = _mm_set1_epi16(2); + __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn); + + w = _mm_cvtsi32_si128(*(unsigned short const*)(vp)); + _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn)); + + __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1)); + __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1)); + + 
_mm256_storeu_si256((__m256i*)dp, w0); + _mm256_storeu_si256((__m256i*)(dp + stride), w1); + } + } + + for (ui32 y = 2; y < height; y += 2) { + { + // perform 15 - count_leading_zeros(*vp) here + ui16 *vp = v_n_scratch; + ui32 *vp_32 = v_n_scratch_32; + + ui16* sp = scratch + (y >> 1) * sstr; + const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2); + const __m256i avx_31 = _mm256_set1_epi32(31); + const __m256i avx_f0 = _mm256_set1_epi32(0xF0); + const __m256i avx_1 = _mm256_set1_epi32(1); + const __m256i avx_0 = _mm256_setzero_si256(); + + for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) { + __m128i v = _mm_loadu_si128((__m128i*)vp); + __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1)); + v = _mm_or_si128(v, v_p1); + + __m256i v_avx = _mm256_cvtepu16_epi32(v); + v_avx = avx2_lzcnt_epi32(v_avx); + v_avx = _mm256_sub_epi32(avx_31, v_avx); + + __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp); + __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0); + __m256i w0 = _mm256_sub_epi32(gamma, avx_1); + gamma = _mm256_and_si256(gamma, w0); + gamma = _mm256_cmpeq_epi32(gamma, avx_0); + + v_avx = _mm256_andnot_si256(gamma, v_avx); + v_avx = _mm256_max_epi32(v_avx, avx_1); + + inf_u_q = _mm256_srli_epi32(inf_u_q, 16); + v_avx = _mm256_add_epi32(inf_u_q, v_avx); + + w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2); + if (!_mm256_testz_si256(w0, w0)) { + return false; + } + + _mm256_storeu_si256((__m256i*)vp_32, v_avx); + } + } + + ui16 *vp = v_n_scratch; + ui32* vp_32 = v_n_scratch_32; + ui16 *sp = scratch + (y >> 1) * sstr; + ui32 *dp = decoded_data + y * stride; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) { + ////process four quads + __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp); + __m128i U_q = _mm_loadu_si128((__m128i*)vp_32); + + __m128i vn = _mm_set1_epi16(2); + __m256i row = decode_four_quad16(inf_u_q, U_q, &magsgn, p, vn); + + __m128i w = _mm_cvtsi32_si128(*(unsigned short 
const*)(vp)); + _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn)); + + __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1)); + __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1)); + + _mm256_storeu_si256((__m256i*)dp, w0); + _mm256_storeu_si256((__m256i*)(dp + stride), w1); + } + } + + // increase bitplane back by 16 because we need to process 32 bits + p += 16; + } + + if (num_passes > 1) + { + // We use scratch again, we can divide it into multiple regions + // sigma holds all the significant samples, and it cannot + // be modified after it is set. it will be used during the + // Magnitude Refinement Pass + ui16* const sigma = scratch; + + ui32 mstr = (width + 3u) >> 2; // divide by 4, since each + // ui16 contains 4 columns + mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8 + + // We re-arrange quad significance, where each 4 consecutive + // bits represent one quad, into column significance, where, + // each 4 consequtive bits represent one column of 4 rows + { + ui32 y; + + const __m128i mask_3 = _mm_set1_epi32(0x30); + const __m128i mask_C = _mm_set1_epi32(0xC0); + const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400); + for (y = 0; y < height; y += 4) + { + ui16* sp = scratch + (y >> 1) * sstr; + ui16* dp = sigma + (y >> 2) * mstr; + for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2) + { + __m128i s0, s1, u3, uC, t0, t1; + + s0 = _mm_loadu_si128((__m128i*)(sp)); + u3 = _mm_and_si128(s0, mask_3); + u3 = _mm_srli_epi32(u3, 4); + uC = _mm_and_si128(s0, mask_C); + uC = _mm_srli_epi32(uC, 2); + t0 = _mm_or_si128(u3, uC); + + s1 = _mm_loadu_si128((__m128i*)(sp + sstr)); + u3 = _mm_and_si128(s1, mask_3); + u3 = _mm_srli_epi32(u3, 2); + uC = _mm_and_si128(s1, mask_C); + t1 = _mm_or_si128(u3, uC); + + __m128i r = _mm_or_si128(t0, t1); + r = 
_mm_shuffle_epi8(r, shuffle_mask); + + // _mm_storeu_si32 is not defined, so we use this workaround + _mm_store_ss((float*)dp, _mm_castsi128_ps(r)); + } + dp[0] = 0; // set an extra entry on the right with 0 + } + { + // reset one row after the codeblock + ui16* dp = sigma + (y >> 2) * mstr; + __m128i zero = _mm_setzero_si128(); + for (ui32 x = 0; x < width; x += 32, dp += 8) + _mm_store_si128((__m128i*)dp, zero); + dp[0] = 0; // set an extra entry on the right with 0 + } + } + + // We perform Significance Propagation Pass here + { + // This stores significance information of the previous + // 4 rows. Significance information in this array includes + // all signicant samples in bitplane p - 1; that is, + // significant samples for bitplane p (discovered during the + // cleanup pass and stored in sigma) and samples that have recently + // became significant (during the SPP) in bitplane p-1. + // We store enough for the widest row, containing 1024 columns, + // which is equivalent to 256 of ui16, since each stores 4 columns. + // We add an extra 8 entries, just in case we need more + ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes + + frwd_struct sigprop; + frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2); + + for (ui32 y = 0; y < height; y += 4) + { + ui32 pattern = 0xFFFFu; // a pattern needed samples + if (height - y < 4) { + pattern = 0x7777u; + if (height - y < 3) { + pattern = 0x3333u; + if (height - y < 2) + pattern = 0x1111u; + } + } + + // prev holds sign. info. for the previous quad, together + // with the rows on top of it and below it. 
+ ui32 prev = 0; + ui16 *prev_sig = prev_row_sig; + ui16 *cur_sig = sigma + (y >> 2) * mstr; + ui32 *dpp = decoded_data + y * stride; + for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig) + { + // only rows and columns inside the stripe are included + si32 s = (si32)x + 4 - (si32)width; + s = ojph_max(s, 0); + pattern = pattern >> (s * 4); + + // We first find locations that need to be tested (potential + // SPP members); these location will end up in mbr + // In each iteration, we produce 16 bits because cwd can have + // up to 16 bits of significance information, followed by the + // corresponding 16 bits of sign information; therefore, it is + // sufficient to fetch 32 bit data per loop. + + // Althougth we are interested in 16 bits only, we load 32 bits. + // For the 16 bits we are producing, we need the next 4 bits -- + // We need data for at least 5 columns out of 8. + // Therefore loading 32 bits is easier than loading 16 bits + // twice. + ui32 ps = *(ui32*)prev_sig; + ui32 ns = *(ui32*)(cur_sig + mstr); + ui32 u = (ps & 0x88888888) >> 3; // the row on top + if (!stripe_causal) + u |= (ns & 0x11111111) << 3; // the row below + + ui32 cs = *(ui32*)cur_sig; + // vertical integration + ui32 mbr = cs; // this sig. info. 
+ mbr |= (cs & 0x77777777) << 1; //above neighbors + mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors + mbr |= u; + // horizontal integration + ui32 t = mbr; + mbr |= t << 4; // neighbors on the left + mbr |= t >> 4; // neighbors on the right + mbr |= prev >> 12; // significance of previous group + + // remove outside samples, and already significant samples + mbr &= pattern; + mbr &= ~cs; + + // find samples that become significant during the SPP + ui32 new_sig = mbr; + if (new_sig) + { + __m128i cwd_vec = frwd_fetch<0>(&sigprop); + ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0); + + ui32 cnt = 0; + ui32 col_mask = 0xFu; + ui32 inv_sig = ~cs & pattern; + for (int i = 0; i < 16; i += 4, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan one column + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x33u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x76u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xECu << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xC8u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + } + + if (new_sig) + { + cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt); + + // Spread new_sig, such that each bit is in one byte with a + // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1 + __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig); + new_sig_vec = _mm_shuffle_epi8(new_sig_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + new_sig_vec = _mm_and_si128(new_sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + new_sig_vec = 
_mm_cmpeq_epi8(new_sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + + // find cumulative sums + // to find which bit in cwd we should extract + __m128i inc_sum = new_sig_vec; // inclusive scan + inc_sum = _mm_abs_epi8(inc_sum); // cvrt to 0 or 1 + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8)); + cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8; + // exclusive scan + __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); + + // Spread cwd, such that each bit is in one byte + // with a value of 0 or 1. + cwd_vec = _mm_set1_epi16((si16)cwd); + cwd_vec = _mm_shuffle_epi8(cwd_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + cwd_vec = _mm_and_si128(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_cmpeq_epi8(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_abs_epi8(cwd_vec); + + // Obtain bit from cwd_vec correspondig to ex_sum + // Basically, collect needed bits from cwd_vec + __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum); + + // load data and set spp coefficients + __m128i m = + _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0); + __m128i val = _mm_set1_epi32(3 << (p - 2)); + ui32 *dp = dpp; + for (int c = 0; c < 4; ++ c) { + __m128i s0, s0_ns, s0_val; + // load coefficients + s0 = _mm_load_si128((__m128i*)dp); + + // epi32 is -1 only for coefficient that + // are changed during the SPP + s0_ns = _mm_shuffle_epi8(new_sig_vec, m); + s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF)); + + // obtain sign for coefficients in SPP + s0_val = _mm_shuffle_epi8(v, m); + s0_val = _mm_slli_epi32(s0_val, 31); + s0_val = _mm_or_si128(s0_val, val); + s0_val = _mm_and_si128(s0_val, s0_ns); + + // update vector + s0 = _mm_or_si128(s0, s0_val); + // store coefficients + _mm_store_si128((__m128i*)dp, s0); + // prepare for 
next row + dp += stride; + m = _mm_add_epi32(m, _mm_set1_epi32(1)); + } + } + frwd_advance(&sigprop, cnt); + } + + new_sig |= cs; + *prev_sig = (ui16)(new_sig); + + // vertical integration for the new sig. info. + t = new_sig; + new_sig |= (t & 0x7777) << 1; //above neighbors + new_sig |= (t & 0xEEEE) >> 1; //below neighbors + // add sig. info. from the row on top and below + prev = new_sig | u; + // we need only the bits in 0xF000 + prev &= 0xF000; + } + } + } + + // We perform Magnitude Refinement Pass here + if (num_passes > 2) + { + rev_struct magref; + rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2); + + for (ui32 y = 0; y < height; y += 4) + { + ui16 *cur_sig = sigma + (y >> 2) * mstr; + ui32 *dpp = decoded_data + y * stride; + for (ui32 i = 0; i < width; i += 4, dpp += 4) + { + //Process one entry from sigma array at a time + // Each nibble (4 bits) in the sigma array represents 4 rows, + ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data + ui16 sig = *cur_sig++; // 16 bit that will be processed now + int total_bits = 0; + if (sig) // if any of the 32 bits are set + { + // We work on 4 rows, with 4 samples each, since + // data is 32 bit (4 bytes) + + // spread the 16 bits in sig to 0 or 1 bytes in sig_vec + __m128i sig_vec = _mm_set1_epi16((si16)sig); + sig_vec = _mm_shuffle_epi8(sig_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + sig_vec = _mm_and_si128(sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + sig_vec = _mm_cmpeq_epi8(sig_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + sig_vec = _mm_abs_epi8(sig_vec); + + // find cumulative sums + // to find which bit in cwd we should extract + __m128i inc_sum = sig_vec; // inclusive scan + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4)); + inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8)); + total_bits = 
_mm_extract_epi16(inc_sum, 7) >> 8; + __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan + + // Spread the 16 bits in cwd to inverted 0 or 1 bytes in + // cwd_vec. Then, convert these to a form suitable + // for coefficient modifications; in particular, a value + // of 0 is presented as binary 11, and a value of 1 is + // represented as binary 01 + __m128i cwd_vec = _mm_set1_epi16((si16)cwd); + cwd_vec = _mm_shuffle_epi8(cwd_vec, + _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)); + cwd_vec = _mm_and_si128(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_cmpeq_epi8(cwd_vec, + _mm_set1_epi64x((si64)0x8040201008040201)); + cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1)); + cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec); + cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1)); + + // load data and insert the mrp bit + __m128i m = + _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0); + ui32 *dp = dpp; + for (int c = 0; c < 4; ++c) { + __m128i s0, s0_sig, s0_idx, s0_val; + // load coefficients + s0 = _mm_load_si128((__m128i*)dp); + // find significant samples in this row + s0_sig = _mm_shuffle_epi8(sig_vec, m); + s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128()); + // get MRP bit index, and MRP pattern + s0_idx = _mm_shuffle_epi8(ex_sum, m); + s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx); + // keep data from significant samples only + s0_val = _mm_andnot_si128(s0_sig, s0_val); + // move mrp bits to correct position, and employ + s0_val = _mm_slli_epi32(s0_val, (si32)p - 2); + s0 = _mm_xor_si128(s0, s0_val); + // store coefficients + _mm_store_si128((__m128i*)dp, s0); + // prepare for next row + dp += stride; + m = _mm_add_epi32(m, _mm_set1_epi32(1)); + } + } + // consume data according to the number of bits set + rev_advance_mrp(&magref, (ui32)total_bits); + } + } + } + } + + return true; + } + } +} From 833e21019fe632292ff1ec2d28ac38125f37c14d Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Mon, 9 Sep 2024 10:15:27 +1000 Subject: 
[PATCH 04/78] Version bump, and adding action to run tests on PR --- .github/workflows/ccp-workflow.yml | 7 ++++++- src/core/common/ojph_version.h | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index df021b0a..2f5278fc 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -1,7 +1,12 @@ # taken from https://github.com/onqtam/doctest/blob/master/.github/workflows/main.yml name: C/C++ CI -on: push +on: + push: + +on: + pull_request: + jobs: build: diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h index 593d4b7f..16a08437 100644 --- a/src/core/common/ojph_version.h +++ b/src/core/common/ojph_version.h @@ -34,5 +34,5 @@ //***************************************************************************/ #define OPENJPH_VERSION_MAJOR 0 -#define OPENJPH_VERSION_MINOR 15 +#define OPENJPH_VERSION_MINOR 16 #define OPENJPH_VERSION_PATCH 0 From a4b8fa1810d679c76f065feae3ef1ab2643ff663 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Mon, 9 Sep 2024 10:18:19 +1000 Subject: [PATCH 05/78] corrected action to run tests on PR --- .github/workflows/ccp-workflow.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 2f5278fc..68323810 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -2,10 +2,11 @@ name: C/C++ CI on: - push: + push on: pull_request: + types: [opened, reopened] jobs: From 64c50cd9811f5932e4485ef2dd311f559cb08521 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Mon, 9 Sep 2024 10:19:33 +1000 Subject: [PATCH 06/78] another attempt to correct action to run tests on PR --- .github/workflows/ccp-workflow.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 68323810..42a51f7a 100644 --- 
a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -1,9 +1,7 @@ # taken from https://github.com/onqtam/doctest/blob/master/.github/workflows/main.yml name: C/C++ CI -on: - push - +on: push on: pull_request: types: [opened, reopened] From 91be87daf947e3263934511e2129494dd648095a Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Mon, 9 Sep 2024 10:20:55 +1000 Subject: [PATCH 07/78] another attempt to correct action to run tests on PR --- .github/workflows/ccp-workflow.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 42a51f7a..74526d1d 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -1,8 +1,8 @@ # taken from https://github.com/onqtam/doctest/blob/master/.github/workflows/main.yml name: C/C++ CI -on: push -on: +on: + push pull_request: types: [opened, reopened] From 900f0b6c253b1b25d556cf3d700bb6c3ab7ed0b3 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Mon, 9 Sep 2024 10:23:25 +1000 Subject: [PATCH 08/78] another attempt to correct action to run tests on PR --- .github/workflows/ccp-workflow.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 74526d1d..560e0c5b 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -3,6 +3,7 @@ name: C/C++ CI on: push +on: pull_request: types: [opened, reopened] From 8159ca178b806b0875efb119b4616aba7902f96c Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Mon, 9 Sep 2024 10:25:26 +1000 Subject: [PATCH 09/78] another attempt to correct action to run tests on PR --- .github/workflows/ccp-workflow.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 560e0c5b..da94fd23 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml 
@@ -2,8 +2,7 @@ name: C/C++ CI on: - push -on: + push: pull_request: types: [opened, reopened] From 0f21481f8a4be073704ff2f625742097f3f37a89 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 17 Sep 2024 19:17:01 +1000 Subject: [PATCH 10/78] Added support for NLT marker only, but not tested. --- src/core/codestream/ojph_codestream.cpp | 6 + src/core/codestream/ojph_codestream_local.h | 3 + src/core/codestream/ojph_params.cpp | 251 +++++++++++++++++++- src/core/codestream/ojph_params_local.h | 57 +++++ src/core/codestream/ojph_resolution.cpp | 4 +- src/core/common/ojph_codestream.h | 16 +- src/core/common/ojph_params.h | 40 ++++ src/core/common/ojph_version.h | 2 +- 8 files changed, 363 insertions(+), 16 deletions(-) diff --git a/src/core/codestream/ojph_codestream.cpp b/src/core/codestream/ojph_codestream.cpp index 06f6b567..f2832ac4 100644 --- a/src/core/codestream/ojph_codestream.cpp +++ b/src/core/codestream/ojph_codestream.cpp @@ -84,6 +84,12 @@ namespace ojph { return param_qcd(&state->qcd); } + //////////////////////////////////////////////////////////////////////////// + param_nlt codestream::access_nlt() + { + return param_nlt(&state->nlt); + } + //////////////////////////////////////////////////////////////////////////// void codestream::set_planar(bool planar) { diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h index 8ca8c717..0a95ef8e 100644 --- a/src/core/codestream/ojph_codestream_local.h +++ b/src/core/codestream/ojph_codestream_local.h @@ -96,6 +96,8 @@ namespace ojph { } const param_dfs* access_dfs() { if (dfs.exists()) return &dfs; else return NULL; } + const param_nlt* access_nlt() + { return ≮ } mem_fixed_allocator* get_allocator() { return allocator; } mem_elastic_allocator* get_elastic_alloc() { return elastic_alloc; } outfile_base* get_file() { return outfile; } @@ -161,6 +163,7 @@ namespace ojph { param_cap cap; // extended capabilities param_qcd qcd; // quantization default param_tlm 
tlm; // tile-part lengths + param_nlt nlt; // non-linearity point transformation private: // this is to handle qcc and coc int used_qcc_fields; diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index b6ada178..bd0471f2 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -372,6 +372,27 @@ namespace ojph { // //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + void param_nlt::set_type3_transformation(ui16 comp_num, bool enable) + { + state->set_type3_transformation(comp_num, enable); + } + + ////////////////////////////////////////////////////////////////////////// + bool param_nlt::get_type3_transformation(ui16 comp_num, ui8& bit_depth, + bool& is_signed) + { + return state->get_type3_transformation(comp_num, bit_depth, is_signed); + } + + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void comment_exchange::set_string(const char* str) { @@ -611,7 +632,7 @@ namespace ojph { if ((Rsiz & 0x4000) == 0) OJPH_ERROR(0x00050044, "Rsiz bit 14 is not set (this is not a JPH file)"); - if ((Rsiz & 0x8000) != 0 && (Rsiz & 0xF5F) != 0) + if ((Rsiz & 0x8000) != 0 && (Rsiz & 0xD5F) != 0) OJPH_WARN(0x00050001, "Rsiz in SIZ has unimplemented fields"); if (file->read(&Xsiz, 4) != 4) OJPH_ERROR(0x00050045, "error reading SIZ marker"); @@ -1214,6 +1235,220 @@ namespace ojph { OJPH_ERROR(0x000500AA, "wrong Sqcc value in QCC marker"); } + ////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + ////////////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////////////////////// + void 
param_nlt::check_validity(const param_siz& siz) + { + if (is_any_enabled() == false) + return; + + bool all_same_bit_depth = true; + bool all_same_signedness = true; + ui32 num_comps = siz.get_num_components(); + + ui32 bit_depth = 0; // unknown yet + bool is_signed = false; // unknown yet + for (ui32 c = 0; c < num_comps; ++c) + { + param_nlt* p = get_comp_object(c); + if (p == NULL || !p->enabled) // comp is not in list or not enabled + { + if (bit_depth == 0) + { // this is the first component which has not type 3 nlt definition + bit_depth = siz.get_bit_depth(c); + is_signed = siz.is_signed(c); + } + else + { // we have seen an undefined component previously + all_same_bit_depth = + all_same_bit_depth && (bit_depth == siz.get_bit_depth(c)); + all_same_signedness = + all_same_signedness && (is_signed != siz.is_signed(c)); + } + } + else + { + p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1); + p->BDnlt |= (ui8)(siz.is_signed(c) ? 0x80 : 0); + } + } + + if (this->enabled) + { + if (bit_depth != 0) // default captures some components + { + this->BDnlt = (ui8)((bit_depth - 1) | (is_signed ? 0x80 : 0)); + if (!all_same_bit_depth || !all_same_signedness) + { + // We cannot use the default for all undefined components, so we + // will keep it and set it to the values of the first undefined + // component, but we will also define that component + + for (ui32 c = 0; c < num_comps; ++c) + { + param_nlt* p = get_comp_object(c); + if (p == NULL) { + // values were defined previously for (p && enabled) + p = add_object(c); + p->enabled = true; + p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1); + p->BDnlt |= (ui8)(siz.is_signed(c) ? 
0x80 : 0); + } + } + } + } + else + this->enabled = false; + } + + trim_non_existing_components(num_comps); + } + + ////////////////////////////////////////////////////////////////////////// + void param_nlt::set_type3_transformation(ui16 comp_num, bool enable) + { + param_nlt* p = get_comp_object(comp_num); + if (p == NULL) + p = add_object(comp_num); + p->enabled = enable; + } + + ////////////////////////////////////////////////////////////////////////// + bool param_nlt::get_type3_transformation(ui16 comp_num, ui8& bit_depth, + bool& is_signed) const + { + const param_nlt* p = get_comp_object(comp_num); + p = p ? p : NULL; + if (p->enabled) + { + bit_depth = (p->BDnlt & 0x7F) + 1; + bit_depth = bit_depth <= 38 ? bit_depth : 38; + is_signed = (p->BDnlt & 0x80) == 0x80; + } + return p->enabled; + } + + ////////////////////////////////////////////////////////////////////////// + bool param_nlt::write(outfile_base* file) const + { + if (is_any_enabled() == false) + return true; + + char buf[2]; + bool result = true; + const param_nlt* p = this; + while (p) + { + if (p->enabled) + { + *(ui16*)buf = JP2K_MARKER::NLT; + *(ui16*)buf = swap_byte(*(ui16*)buf); + result &= file->write(&buf, 2) == 2; + *(ui16*)buf = swap_byte(Lnlt); + result &= file->write(&buf, 2) == 2; + *(ui16*)buf = swap_byte(Cnlt); + result &= file->write(&buf, 2) == 2; + result &= file->write(&BDnlt, 1) == 1; + result &= file->write(&Tnlt, 1) == 1; + } + p = p->next; + } + return result; + } + + ////////////////////////////////////////////////////////////////////////// + void param_nlt::read(infile_base* file) + { + ui8 buf[6]; + bool result = true; + + if (result &= file->read(buf, 6) == 6) + OJPH_ERROR(0x00050141, "error reading NLT marker segment"); + + ui16 length = swap_byte(*(ui16*)buf); + if (length != 6 || buf[5] != 3) // wrong length or type + OJPH_ERROR(0x00050142, "Unsupported NLT type %d\n", buf[5]); + + ui16 comp = swap_byte(*(ui16*)(buf + 2)); + param_nlt* p = this; + if (comp != 65535) 
+ { + p = get_comp_object(comp); + if (p == NULL) + p = add_object(comp); + } + p->enabled = true; + p->Cnlt = comp; + p->BDnlt = buf[4]; + } + + ////////////////////////////////////////////////////////////////////////// + param_nlt* param_nlt::get_comp_object(ui32 comp_num) + { + // cast object to constant + const param_nlt* const_p = const_cast(this); + // call using the constant object, then cast to non-const + return const_cast(const_p->get_comp_object(comp_num)); + } + + ////////////////////////////////////////////////////////////////////////// + const param_nlt* param_nlt::get_comp_object(ui32 comp_num) const + { + if (Cnlt == comp_num) + return this; + else { + param_nlt* p = next; + while (p && p->Cnlt != comp_num) + p = p->next; + return p; + } + } + + ////////////////////////////////////////////////////////////////////////// + param_nlt* param_nlt::add_object(ui32 comp_num) + { + assert(Cnlt != comp_num); + param_nlt* p = this; + while (p->next != NULL) { + assert(p->Cnlt != comp_num); + p = p->next; + } + p->next = new param_nlt; + p = p->next; + p->Cnlt = (ui16)comp_num; + p->alloced_next = true; + return p; + } + + ////////////////////////////////////////////////////////////////////////// + bool param_nlt::is_any_enabled() const + { + // check if any field is enabled + const param_nlt* p = this; + while (p && p->enabled == false) + p = p->next; + return (p != NULL); + } + + ////////////////////////////////////////////////////////////////////////// + void param_nlt::trim_non_existing_components(ui32 num_comps) + { + param_nlt* p = this->next; + while (p) { + if (p->enabled == true && p->Cnlt >= num_comps) + p->enabled = false; + p = p->next; + } + } + + ////////////////////////////////////////////////////////////////////////// // // @@ -1239,10 +1474,8 @@ namespace ojph { result &= file->write(&buf, 2) == 2; *(ui32*)buf = swap_byte(Psot); result &= file->write(&buf, 4) == 4; - *(ui8*)buf = TPsot; - result &= file->write(&buf, 1) == 1; - *(ui8*)buf = 
TNsot; - result &= file->write(&buf, 1) == 1; + result &= file->write(&TPsot, 1) == 1; + result &= file->write(&TNsot, 1) == 1; return result; } @@ -1263,10 +1496,8 @@ namespace ojph { result &= file->write(&buf, 2) == 2; *(ui32*)buf = swap_byte(payload_len + 14); result &= file->write(&buf, 4) == 4; - *(ui8*)buf = TPsot; - result &= file->write(&buf, 1) == 1; - *(ui8*)buf = TNsot; - result &= file->write(&buf, 1) == 1; + result &= file->write(&TPsot, 1) == 1; + result &= file->write(&TNsot, 1) == 1; return result; } @@ -1363,7 +1594,7 @@ namespace ojph { "In any case, this limit means that we have 10922 " "tileparts or more, which is a huge number."); this->num_pairs = num_pairs; - pairs = (Ttlm_Ptlm_pair*)store; + pairs = store; Ltlm = (ui16)(4 + 6 * num_pairs); Ztlm = 0; Stlm = 0x60; diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 1958b8e8..4064116b 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -138,6 +138,7 @@ namespace ojph { COM = 0xFF64, //comment DFS = 0xFF72, //downsampling factor styles ADS = 0xFF73, //arbitrary decomposition styles + NLT = 0xFF76, //non-linearity point transformation ATK = 0xFF79, //arbitrary transformation kernels SOT = 0xFF90, //start of tile-part SOP = 0xFF91, //start of packet @@ -659,6 +660,62 @@ namespace ojph { ui16 comp_idx; }; + /////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + /////////////////////////////////////////////////////////////////////////// + // data structures used by param_nlt + struct param_nlt + { + public: + param_nlt() { + Lnlt = 6; + Cnlt = 65535; // default + BDnlt = 0; + Tnlt = 3; + enabled = false; next = NULL; alloced_next = false; + } + + ~param_nlt() { + if (next && alloced_next) { + delete next; + next = NULL; + } + } + + void check_validity(const param_siz& siz); + void set_type3_transformation(ui16 comp_num, bool enable); + bool 
get_type3_transformation(ui16 comp_num, ui8& bit_depth, + bool& is_signed) const; + bool write(outfile_base* file) const; + void read(infile_base* file); + + private: + const param_nlt* get_comp_object(ui32 comp_num) const; + param_nlt* get_comp_object(ui32 comp_num); + param_nlt* add_object(ui32 comp_num); + bool is_any_enabled() const; + void trim_non_existing_components(ui32 num_comps); + + private: + ui16 Lnlt; // length of the marker segment excluding marker + ui16 Cnlt; // Component involved in the transformation + ui8 BDnlt; // Decoded image component bit depth parameter + ui8 Tnlt; // Type of non-linearity + bool enabled; // true if this object is used + param_nlt* next; // for chaining NLT markers + bool alloced_next; // true if next was allocated, not just set to an + // existing object + + // The top level param_nlt object is not allocated, but as part of + // codestream, and is used to manage allocated next objects. + // next holds a list of param_nlt objects, which are managed by the top + // param_nlt object. + }; + /////////////////////////////////////////////////////////////////////////// // // diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp index b82a810a..87466e0d 100644 --- a/src/core/codestream/ojph_resolution.cpp +++ b/src/core/codestream/ojph_resolution.cpp @@ -245,8 +245,8 @@ namespace ojph { const param_dfs* dfs = codestream->access_dfs(); if (dfs == NULL) { OJPH_ERROR(0x00070011, "There is a problem with codestream " - "marker segments. COD/COC specifies the use of a DFS marker " - "but there are no DFS markers within the main codestream " + "marker segments. 
COD/COC specifies the use of a DFS marker " + "but there are no DFS markers within the main codestream " "headers"); } else { diff --git a/src/core/common/ojph_codestream.h b/src/core/common/ojph_codestream.h index 5f6dcdb1..c28096ed 100644 --- a/src/core/common/ojph_codestream.h +++ b/src/core/common/ojph_codestream.h @@ -57,6 +57,7 @@ namespace ojph { class param_siz; class param_cod; class param_qcd; + class param_nlt; class comment_exchange; class mem_fixed_allocator; struct point; @@ -318,7 +319,7 @@ namespace ojph { * @brief Returns the underlying SIZ marker segment object * * @return param_siz This object holds SIZ marker segment information, - * which are related to codestream dimensions, number + * which deals with codestream dimensions, number * of components, bit depth, ... etc. */ param_siz access_siz(); @@ -327,7 +328,7 @@ namespace ojph { * @brief Returns the underlying COD marker segment object * * @return param_cod This object holds COD marker segment information, - * which are related to coding parameters, such as + * which deals with coding parameters, such as * codeblock sizes, progression order, reversible, * ... etc. */ @@ -337,11 +338,20 @@ namespace ojph { * @brief Returns the underlying QCD marker segment object * * @return param_qcd This object holds QCD marker segment information, - * which are related to quantization parameters -- + * which deals with quantization parameters -- * quantization step size for each subband. */ param_qcd access_qcd(); + /** + * @brief Returns the underlying NLT marker segment object + * + * @return param_nlt This object holds NLT marker segment information, + * which deals with non-linearity point transformation + * for each component. + */ + param_nlt access_nlt(); + /** * @brief Query if the codestream extraction is planar or not. 
* See the documentation for ojph::codestream::set_planar() diff --git a/src/core/common/ojph_params.h b/src/core/common/ojph_params.h index 0dce0cea..5a74160d 100644 --- a/src/core/common/ojph_params.h +++ b/src/core/common/ojph_params.h @@ -52,6 +52,7 @@ namespace ojph { struct param_qcd; struct param_qcc; struct param_cap; + struct param_nlt; class codestream; } @@ -131,6 +132,45 @@ namespace ojph { local::param_qcd* state; }; + /** + * @brief non-linearity point transformation object + * (implements NLT marker segment) + * + */ + class OJPH_EXPORT param_nlt + { + public: + param_nlt(local::param_nlt* p) : state(p) {} + + /** + * @brief enables or disables type 3 nonlinearity for a component + * or the default setting + * + * If you think that you need type 3 nonlinearity for all components, + * call this function with comp_num set to 65535 and enable to true. + * + * @param comp_num: component number, or 65535 for the default setting + * @param enable: true to enable nlt type 3 for this component or the + default setting, false to disable nlt type 3. + */ + void set_type3_transformation(ui16 comp_num, bool enable); + + /** + * @brief get the state (enabled or disabled) of type 3 nonlinearity + * for a component or the default setting + * + * @param comp_num: component number, or 65535 for the default setting + * @param bit_depth: returns the bit depth of the component/default + * @param is_signed: returns true if the component/default is signed + * @return true if enabled or false if not. 
+ */ + bool get_type3_transformation(ui16 comp_num, ui8& bit_depth, + bool& is_signed); + + private: + local::param_nlt* state; + }; + //////////////////////////////////////////////////////////////////////////// class OJPH_EXPORT comment_exchange { diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h index 16a08437..2f3adcc6 100644 --- a/src/core/common/ojph_version.h +++ b/src/core/common/ojph_version.h @@ -34,5 +34,5 @@ //***************************************************************************/ #define OPENJPH_VERSION_MAJOR 0 -#define OPENJPH_VERSION_MINOR 16 +#define OPENJPH_VERSION_MINOR 17 #define OPENJPH_VERSION_PATCH 0 From 5d888b5924f58e1ff287169482fbb3bcdc722046 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 17 Sep 2024 19:18:00 +1000 Subject: [PATCH 11/78] This is for lossless compression. It removes the 1.1f factor. --- src/core/codestream/ojph_params.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index bd0471f2..e17c1d53 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -935,17 +935,16 @@ namespace ojph { B += is_employing_color_transform ? 
1 : 0; //1 bit for RCT int s = 0; float bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); - //we leave some leeway for numerical error by multiplying by 1.1f - ui32 X = (ui32) ceil(log(bibo_l * bibo_l * 1.1f) / M_LN2); + ui32 X = (ui32) ceil(log(bibo_l * bibo_l) / M_LN2); u8_SPqcd[s++] = (ui8)((B + X) << 3); for (ui32 d = num_decomps; d > 0; --d) { float bibo_l = bibo_gains::get_bibo_gain_l(d, true); float bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); - X = (ui32) ceil(log(bibo_h * bibo_l * 1.1f) / M_LN2); + X = (ui32) ceil(log(bibo_h * bibo_l) / M_LN2); u8_SPqcd[s++] = (ui8)((B + X) << 3); u8_SPqcd[s++] = (ui8)((B + X) << 3); - X = (ui32) ceil(log(bibo_h * bibo_h * 1.1f) / M_LN2); + X = (ui32) ceil(log(bibo_h * bibo_h) / M_LN2); u8_SPqcd[s++] = (ui8)((B + X) << 3); } } From 5f296034f69763704ddd4e19fc47b305e308de37 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 17 Sep 2024 21:24:59 +1000 Subject: [PATCH 12/78] This should complete all code changed for the library. We need to test now. 
--- src/core/codestream/ojph_codestream_local.cpp | 59 ++++++++++++------- src/core/codestream/ojph_codestream_local.h | 2 +- src/core/codestream/ojph_params.cpp | 2 +- src/core/codestream/ojph_tile.cpp | 46 ++++++++++++--- src/core/codestream/ojph_tile.h | 1 + src/core/transform/ojph_colour.cpp | 36 ++++++++--- src/core/transform/ojph_colour.h | 4 ++ src/core/transform/ojph_colour_avx2.cpp | 21 +++++++ src/core/transform/ojph_colour_local.h | 17 ++++++ src/core/transform/ojph_colour_sse2.cpp | 18 ++++++ src/core/transform/ojph_colour_wasm.cpp | 18 ++++++ 11 files changed, 182 insertions(+), 42 deletions(-) diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp index 82794668..7a114b7f 100644 --- a/src/core/codestream/ojph_codestream_local.cpp +++ b/src/core/codestream/ojph_codestream_local.cpp @@ -550,6 +550,7 @@ namespace ojph { cod.update_atk(atk); qcd.check_validity(siz, cod); cap.check_validity(cod, qcd); + nlt.check_validity(siz); if (profile == OJPH_PN_IMF) check_imf_validity(); else if (profile == OJPH_PN_BROADCAST) @@ -632,6 +633,9 @@ namespace ojph { if (!qcd.write(file)) OJPH_ERROR(0x00030026, "Error writing to file"); + if (!nlt.write(file)) + OJPH_ERROR(0x00030027, "Error writing to file"); + char buf[] = " OpenJPH Ver " OJPH_INT_TO_STRING(OPENJPH_VERSION_MAJOR) "." OJPH_INT_TO_STRING(OPENJPH_VERSION_MINOR) "." 
@@ -642,23 +646,23 @@ namespace ojph { //1 for General use (IS 8859-15:1999 (Latin) values) *(ui16*)(buf + 4) = swap_byte((ui16)(1)); if (file->write(buf, len) != len) - OJPH_ERROR(0x00030027, "Error writing to file"); + OJPH_ERROR(0x00030028, "Error writing to file"); if (comments != NULL) { for (ui32 i = 0; i < num_comments; ++i) { t = swap_byte(JP2K_MARKER::COM); if (file->write(&t, 2) != 2) - OJPH_ERROR(0x00030028, "Error writing to file"); + OJPH_ERROR(0x00030029, "Error writing to file"); t = swap_byte((ui16)(comments[i].len + 4)); if (file->write(&t, 2) != 2) - OJPH_ERROR(0x00030029, "Error writing to file"); + OJPH_ERROR(0x0003002A, "Error writing to file"); //1 for General use (IS 8859-15:1999 (Latin) values) t = swap_byte(comments[i].Rcom); if (file->write(&t, 2) != 2) - OJPH_ERROR(0x0003002A, "Error writing to file"); - if (file->write(comments[i].data, comments[i].len)!=comments[i].len) OJPH_ERROR(0x0003002B, "Error writing to file"); + if (file->write(comments[i].data, comments[i].len)!=comments[i].len) + OJPH_ERROR(0x0003002C, "Error writing to file"); } } } @@ -728,8 +732,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void codestream::read_headers(infile_base *file) { - ui16 marker_list[19] = { SOC, SIZ, CAP, PRF, CPF, COD, COC, QCD, QCC, - RGN, POC, PPM, TLM, PLM, CRG, COM, DFS, ATK, SOT }; + ui16 marker_list[20] = { SOC, SIZ, CAP, PRF, CPF, COD, COC, QCD, QCC, + RGN, POC, PPM, TLM, PLM, CRG, COM, DFS, ATK, NLT, SOT }; find_marker(file, marker_list, 1); //find SOC find_marker(file, marker_list + 1, 1); //find SIZ siz.read(file); @@ -737,7 +741,7 @@ namespace ojph { int received_markers = 0; //check that COD, & QCD received while (true) { - marker_idx = find_marker(file, marker_list + 2, 17); + marker_idx = find_marker(file, marker_list + 2, 18); if (marker_idx == 0) cap.read(file); else if (marker_idx == 1) @@ -813,6 +817,8 @@ namespace ojph { else if (marker_idx == 15) atk[2].read(file); else if 
(marker_idx == 16) + nlt.read(file); + else if (marker_idx == 17) break; else OJPH_ERROR(0x00030051, "File ended before finding a tile segment"); @@ -902,19 +908,20 @@ namespace ojph { } bool sod_found = false; - ui16 other_tile_part_markers[6] = { SOT, POC, PPT, PLT, COM, SOD }; + ui16 other_tile_part_markers[7] = { SOT, POC, PPT, PLT, COM, + NLT, SOD }; while (true) { int marker_idx = 0; int result = 0; - marker_idx = find_marker(infile, other_tile_part_markers + 1, 5); + marker_idx = find_marker(infile, other_tile_part_markers + 1, 6); if (marker_idx == 0) result = skip_marker(infile, "POC", - "POC in a tile is not supported yet", + "POC marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 1) result = skip_marker(infile, "PPT", - "PPT in a tile is not supported yet", + "PPT marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 2) //Skipping PLT marker segment;this should not cause any issues @@ -924,6 +931,10 @@ namespace ojph { result = skip_marker(infile, "COM", NULL, OJPH_MSG_LEVEL::NO_MSG, resilient); else if (marker_idx == 4) + result = skip_marker(infile, "NLT", + "NLT marker in tile is not supported yet", + OJPH_MSG_LEVEL::WARN, resilient); + else if (marker_idx == 5) { sod_found = true; break; @@ -961,40 +972,40 @@ namespace ojph { else { //first tile part bool sod_found = false; - ui16 first_tile_part_markers[11] = { SOT, COD, COC, QCD, QCC, RGN, - POC, PPT, PLT, COM, SOD }; + ui16 first_tile_part_markers[12] = { SOT, COD, COC, QCD, QCC, RGN, + POC, PPT, PLT, COM, NLT, SOD }; while (true) { int marker_idx = 0; int result = 0; - marker_idx = find_marker(infile, first_tile_part_markers+1, 10); + marker_idx = find_marker(infile, first_tile_part_markers+1, 11); if (marker_idx == 0) result = skip_marker(infile, "COD", - "COD in a tile is not supported yet", + "COD marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if 
(marker_idx == 1) result = skip_marker(infile, "COC", - "COC in a tile is not supported yet", + "COC marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 2) result = skip_marker(infile, "QCD", - "QCD in a tile is not supported yet", + "QCD marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 3) result = skip_marker(infile, "QCC", - "QCC in a tile is not supported yet", + "QCC marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 4) result = skip_marker(infile, "RGN", - "RGN in a tile is not supported yet", + "RGN marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 5) result = skip_marker(infile, "POC", - "POC in a tile is not supported yet", + "POC marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 6) result = skip_marker(infile, "PPT", - "PPT in a tile is not supported yet", + "PPT marker segment in a tile is not supported yet", OJPH_MSG_LEVEL::WARN, resilient); else if (marker_idx == 7) //Skipping PLT marker segment;this should not cause any issues @@ -1004,6 +1015,10 @@ namespace ojph { result = skip_marker(infile, "COM", NULL, OJPH_MSG_LEVEL::NO_MSG, resilient); else if (marker_idx == 9) + result = skip_marker(infile, "NLT", + "NLT marker segment in a tile is not supported yet", + OJPH_MSG_LEVEL::WARN, resilient); + else if (marker_idx == 10) { sod_found = true; break; diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h index 0a95ef8e..e6930d5f 100644 --- a/src/core/codestream/ojph_codestream_local.h +++ b/src/core/codestream/ojph_codestream_local.h @@ -96,7 +96,7 @@ namespace ojph { } const param_dfs* access_dfs() { if (dfs.exists()) return &dfs; else return NULL; } - const param_nlt* access_nlt() + const param_nlt* get_nlt() { return &nlt; }
mem_fixed_allocator* get_allocator() { return allocator; } mem_elastic_allocator* get_elastic_alloc() { return elastic_alloc; } diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index e17c1d53..f75bcd42 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -1324,7 +1324,7 @@ namespace ojph { bool& is_signed) const { const param_nlt* p = get_comp_object(comp_num); - p = p ? p : NULL; + p = p ? p : this; if (p->enabled) { bit_depth = (p->BDnlt & 0x7F) + 1; diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index 3be907d4..29377e73 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -67,6 +67,7 @@ namespace ojph { allocator->pre_alloc_obj(num_comps); //for line_offsets allocator->pre_alloc_obj(num_comps); //for num_bits allocator->pre_alloc_obj(num_comps); //for is_signed + allocator->pre_alloc_obj(num_comps); //for nlt_type3 allocator->pre_alloc_obj(num_comps); //for cur_line ui32 tilepart_div = codestream->get_tilepart_div(); @@ -142,6 +143,7 @@ namespace ojph { //allocate tiles_comp const param_siz *szp = codestream->get_siz(); + const param_nlt *nlp = codestream->get_nlt(); this->num_bytes = 0; num_comps = szp->get_num_components(); @@ -152,6 +154,7 @@ namespace ojph { line_offsets = allocator->post_alloc_obj(num_comps); num_bits = allocator->post_alloc_obj(num_comps); is_signed = allocator->post_alloc_obj(num_comps); + nlt_type3 = allocator->post_alloc_obj(num_comps); cur_line = allocator->post_alloc_obj(num_comps); profile = codestream->get_profile(); @@ -176,6 +179,8 @@ namespace ojph { ui32 width = 0; for (ui32 i = 0; i < num_comps; ++i) { + ui8 bd; bool is; // used for nlt_type3 + point downsamp = szp->get_downsampling(i); point recon_downsamp = szp->get_recon_downsampling(i); @@ -205,6 +210,13 @@ namespace ojph { num_bits[i] = szp->get_bit_depth(i); is_signed[i] = szp->is_signed(i); + nlt_type3[i] = 
nlp->get_type3_transformation(i, bd, is); + if (nlt_type3[i] == true && (bd != num_bits[i] || is != is_signed[i])) + OJPH_ERROR(0x000300A1, "Mismatch between Ssiz (bit_depth = %d, " + "is_signed = %s) from SIZ marker segment, and BDnlt " + "(bit_depth = %d, is_signed = %s) from NLT marker segment, " + "for component %d", num_bits[i], + is_signed[i] ? "True" : "False", bd, is ? "True" : "False", i); cur_line[i] = 0; } @@ -250,8 +262,12 @@ namespace ojph { int shift = 1 << (num_bits[comp_num] - 1); const si32 *sp = line->i32 + line_offsets[comp_num]; si32* dp = tc->i32; - if (is_signed[comp_num]) - memcpy(dp, sp, comp_width * sizeof(si32)); + if (is_signed[comp_num]) { + if (nlt_type3[comp_num]) + cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); + else + memcpy(dp, sp, comp_width * sizeof(si32)); + } else cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width); } @@ -269,14 +285,18 @@ namespace ojph { } else { + int shift = 1 << (num_bits[comp_num] - 1); ui32 comp_width = comp_rects[comp_num].siz.w; if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); const si32 *sp = line->i32 + line_offsets[comp_num]; si32 *dp = lines[comp_num].i32; - if (is_signed[comp_num]) - memcpy(dp, sp, comp_width * sizeof(si32)); + if (is_signed[comp_num]) { + if (nlt_type3[comp_num]) + cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); + else + memcpy(dp, sp, comp_width * sizeof(si32)); + } else cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width); if (comp_num == 2) @@ -333,8 +353,12 @@ namespace ojph { int shift = 1 << (num_bits[comp_num] - 1); const si32 *sp = src_line->i32; si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) - memcpy(dp, sp, comp_width * sizeof(si32)); + if (is_signed[comp_num]) { + if (nlt_type3[comp_num]) + cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); + else + memcpy(dp, sp, comp_width * sizeof(si32)); + } else cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); } @@ -373,8 +397,12 @@ namespace
ojph { else sp = comps[comp_num].pull_line()->i32; si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) - memcpy(dp, sp, comp_width * sizeof(si32)); + if (is_signed[comp_num]) { + if (nlt_type3[comp_num]) + cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); + else + memcpy(dp, sp, comp_width * sizeof(si32)); + } else cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); } diff --git a/src/core/codestream/ojph_tile.h b/src/core/codestream/ojph_tile.h index 056c7c94..4b542421 100644 --- a/src/core/codestream/ojph_tile.h +++ b/src/core/codestream/ojph_tile.h @@ -89,6 +89,7 @@ namespace ojph { ui32 *num_bits; bool *is_signed; ui32 *cur_line; + bool *nlt_type3; int prog_order; private: diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index fb42a7d7..34161d43 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -49,43 +49,47 @@ namespace ojph { void (*cnvrt_si32_to_si32_shftd) (const si32 *sp, si32 *dp, int shift, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + void (*cnvrt_si32_to_si32_nlt_type3) + (const si32* sp, si32* dp, int shift, ui32 width) = NULL; + + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float_shftd) (const si32 *sp, float *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float) (const si32 *sp, float *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32_shftd) (const float *sp, si32 *dp, float mul, ui32 width) = NULL; - 
//////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32) (const float *sp, si32 *dp, float mul, ui32 width) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*rct_forward) (const si32 *r, const si32 *g, const si32 *b, si32 *y, si32 *cb, si32 *cr, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*rct_backward) (const si32 *y, const si32 *cb, const si32 *cr, si32 *r, si32 *g, si32 *b, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*ict_forward) (const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// void (*ict_backward) (const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat) = NULL; - //////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// static bool colour_transform_functions_initialized = false; ////////////////////////////////////////////////////////////////////////// @@ -97,6 +101,7 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) cnvrt_si32_to_si32_shftd = gen_cnvrt_si32_to_si32_shftd; + cnvrt_si32_to_si32_nlt_type3 = gen_cnvrt_si32_to_si32_nlt_type3; cnvrt_si32_to_float_shftd = gen_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = gen_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = 
gen_cnvrt_float_to_si32_shftd; @@ -128,6 +133,7 @@ namespace ojph { cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd; cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32; cnvrt_si32_to_si32_shftd = sse2_cnvrt_si32_to_si32_shftd; + cnvrt_si32_to_si32_nlt_type3 = sse2_cnvrt_si32_to_si32_nlt_type3; rct_forward = sse2_rct_forward; rct_backward = sse2_rct_backward; } @@ -149,6 +155,7 @@ namespace ojph { if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { cnvrt_si32_to_si32_shftd = avx2_cnvrt_si32_to_si32_shftd; + cnvrt_si32_to_si32_nlt_type3 = avx2_cnvrt_si32_to_si32_nlt_type3; rct_forward = avx2_rct_forward; rct_backward = avx2_rct_backward; } @@ -162,6 +169,7 @@ namespace ojph { #else // OJPH_ENABLE_WASM_SIMD cnvrt_si32_to_si32_shftd = wasm_cnvrt_si32_to_si32_shftd; + cnvrt_si32_to_si32_nlt_type3 = wasm_cnvrt_si32_to_si32_nlt_type3; cnvrt_si32_to_float_shftd = wasm_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = wasm_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd; @@ -200,6 +208,16 @@ namespace ojph { *dp++ = *sp++ + shift; } + ////////////////////////////////////////////////////////////////////////// + void gen_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, + int shift, ui32 width) + { + for (ui32 i = width; i > 0; --i) { + const si32 v = *sp++; + *dp++ = v >= 0 ?
v : (- v - shift); + } + } + ////////////////////////////////////////////////////////////////////////// void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width) diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h index 212848b5..52df3123 100644 --- a/src/core/transform/ojph_colour.h +++ b/src/core/transform/ojph_colour.h @@ -49,6 +49,10 @@ namespace ojph { extern void (*cnvrt_si32_to_si32_shftd) (const si32 *sp, si32 *dp, int shift, ui32 width); + //////////////////////////////////////////////////////////////////////////// + extern void (*cnvrt_si32_to_si32_nlt_type3) + (const si32 *sp, si32 *dp, int shift, ui32 width); + //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_si32_to_float_shftd) (const si32 *sp, float *dp, float mul, ui32 width); diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 60e20d6f..14e5a35d 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -59,6 +59,27 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void avx2_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, + int shift, ui32 width) + { + __m256i sh = _mm256_set1_epi32(-shift); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + __m256i c = _mm256_cmpgt_epi32(s, zero); // 0xFFFFFFFF for +ve value + __m256i z = _mm256_cmpeq_epi32(s, zero); // 0xFFFFFFFF for 0 + c = _mm256_or_si256(c, z); // 0xFFFFFFFF for +ve and 0 + + __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm256_andnot_si256(c, v_m_sh); // keep only - shift - value + s = _mm256_and_si256(c, s); // keep only +ve or 0 + s = _mm256_or_si256(s, v_m_sh); // combine + _mm256_storeu_si256((__m256i*)dp, s); + } + } + 
////////////////////////////////////////////////////////////////////////// void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, si32 *y, si32 *cb, si32 *cr, ui32 repeat) diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index 6ddf8900..ae5eba1b 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -68,6 +68,10 @@ namespace ojph { void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// + void gen_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, + int shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width); @@ -160,6 +164,11 @@ namespace ojph { void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// + void sse2_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, + int shift, ui32 width); + + ////////////////////////////////////////////////////////////////////////// void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, si32 *y, si32 *cb, si32 *cr, ui32 repeat); @@ -212,6 +221,10 @@ namespace ojph { void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// + void avx2_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, + int shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, si32 *y, si32 *cb, si32 *cr, ui32 repeat); @@ -248,6 +261,10 @@ namespace ojph { void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, ui32 width); + 
////////////////////////////////////////////////////////////////////////// + void wasm_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, + int shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, si32 *y, si32 *cb, si32 *cr, ui32 repeat); diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 4a3cb145..c50c091e 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -94,6 +94,24 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void sse2_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, + int shift, ui32 width) + { + __m128i sh = _mm_set1_epi32(-shift); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i s = _mm_loadu_si128((__m128i*)sp); + __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value + __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + s = _mm_andnot_si128(c, s); // keep only +ve or 0 + s = _mm_or_si128(s, v_m_sh); // combine + _mm_storeu_si128((__m128i*)dp, s); + } + } + ////////////////////////////////////////////////////////////////////////// void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, si32 *y, si32 *cb, si32 *cr, ui32 repeat) diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 632a6454..bc25d426 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -122,6 +122,24 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void wasm_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, + int shift, ui32 width) + { + v128_t sh = wasm_f32x4_splat(-shift); + v128_t zero = 
wasm_f32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s = wasm_v128_load(sp); + v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value + v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value + v_m_sh = wasm_v128_and(v_m_sh, c); // keep only - shift - value + s = wasm_v128_andnot(s, c); // keep only +ve or 0 + s = wasm_v128_or(s, v_m_sh); // combine + wasm_v128_store(dp, s); + } + } + ////////////////////////////////////////////////////////////////////////// void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, si32 *y, si32 *cb, si32 *cr, ui32 repeat) From 517e1be033cf17d0f8cb8f9685ff24c3c03d0086 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 17 Sep 2024 21:31:41 +1000 Subject: [PATCH 13/78] Getting rid of some of the warnings --- src/core/codestream/ojph_params.cpp | 12 ++++++------ src/core/codestream/ojph_params_local.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index f75bcd42..da91a54d 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -1275,7 +1275,7 @@ namespace ojph { else { p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1); - p->BDnlt |= (ui8)(siz.is_signed(c) ? 0x80 : 0); + p->BDnlt |= (ui8)(siz.is_signed(c) ? 0x80 : (ui8)0); } } @@ -1283,7 +1283,7 @@ namespace ojph { { if (bit_depth != 0) // default captures some components { - this->BDnlt = (ui8)((bit_depth - 1) | (is_signed ? 0x80 : 0)); + this->BDnlt = (ui8)((bit_depth - 1) | (is_signed ? 0x80 : (ui8)0)); if (!all_same_bit_depth || !all_same_signedness) { // We cannot use the default for all undefined components, so we @@ -1298,7 +1298,7 @@ namespace ojph { p = add_object(c); p->enabled = true; p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1); - p->BDnlt |= (ui8)(siz.is_signed(c) ? 0x80 : 0); + p->BDnlt |= siz.is_signed(c) ? 
0x80 : (ui8)0; } } } @@ -1311,7 +1311,7 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void param_nlt::set_type3_transformation(ui16 comp_num, bool enable) + void param_nlt::set_type3_transformation(ui32 comp_num, bool enable) { param_nlt* p = get_comp_object(comp_num); if (p == NULL) @@ -1320,14 +1320,14 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - bool param_nlt::get_type3_transformation(ui16 comp_num, ui8& bit_depth, + bool param_nlt::get_type3_transformation(ui32 comp_num, ui8& bit_depth, bool& is_signed) const { const param_nlt* p = get_comp_object(comp_num); p = p ? p : this; if (p->enabled) { - bit_depth = (p->BDnlt & 0x7F) + 1; + bit_depth = (p->BDnlt & 0x7F) + (ui8)1; bit_depth = bit_depth <= 38 ? bit_depth : 38; is_signed = (p->BDnlt & 0x80) == 0x80; } diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 4064116b..fa2f6906 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -687,8 +687,8 @@ namespace ojph { } void check_validity(const param_siz& siz); - void set_type3_transformation(ui16 comp_num, bool enable); - bool get_type3_transformation(ui16 comp_num, ui8& bit_depth, + void set_type3_transformation(ui32 comp_num, bool enable); + bool get_type3_transformation(ui32 comp_num, ui8& bit_depth, bool& is_signed) const; bool write(outfile_base* file) const; void read(infile_base* file); From 2626fe244ac9f07939f67891b7b0d7063caaa1e8 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 17 Sep 2024 21:34:51 +1000 Subject: [PATCH 14/78] 2nd attempt to get rid of warnings. 
--- src/core/codestream/ojph_params.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index da91a54d..390b6e45 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -1275,7 +1275,7 @@ namespace ojph { else { p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1); - p->BDnlt |= (ui8)(siz.is_signed(c) ? 0x80 : (ui8)0); + p->BDnlt = (ui8)(p->BDnlt | (siz.is_signed(c) ? 0x80 : 0)); } } @@ -1298,7 +1298,7 @@ namespace ojph { p = add_object(c); p->enabled = true; p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1); - p->BDnlt |= siz.is_signed(c) ? 0x80 : (ui8)0; + p->BDnlt = (ui8)(p->BDnlt | (siz.is_signed(c) ? 0x80 : 0)); } } } @@ -1327,7 +1327,7 @@ namespace ojph { p = p ? p : this; if (p->enabled) { - bit_depth = (p->BDnlt & 0x7F) + (ui8)1; + bit_depth = (ui8)((p->BDnlt & 0x7F) + 1); bit_depth = bit_depth <= 38 ? bit_depth : 38; is_signed = (p->BDnlt & 0x80) == 0x80; } From fcd652c46c16b53d0bcdd8a41b824f9a4e68d2a9 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 17 Sep 2024 21:39:45 +1000 Subject: [PATCH 15/78] Small bug fix for wasm --- src/core/transform/ojph_colour_wasm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index bc25d426..e174d0b1 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -126,8 +126,8 @@ namespace ojph { void wasm_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, int shift, ui32 width) { - v128_t sh = wasm_f32x4_splat(-shift); - v128_t zero = wasm_f32x4_splat(0); + v128_t sh = wasm_i32x4_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) { v128_t s = wasm_v128_load(sp); From 086a5a54eff0a97ce001f31fa4e7308ab6e214af Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 17 Sep 2024 22:24:47 +1000 Subject: 
[PATCH 16/78] A bug fix, and addressing the float to double type conversion. --- src/core/codestream/ojph_params.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index 390b6e45..f250c130 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -934,13 +934,13 @@ namespace ojph { ui32 B = bit_depth; B += is_employing_color_transform ? 1 : 0; //1 bit for RCT int s = 0; - float bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); + double bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); ui32 X = (ui32) ceil(log(bibo_l * bibo_l) / M_LN2); u8_SPqcd[s++] = (ui8)((B + X) << 3); for (ui32 d = num_decomps; d > 0; --d) { - float bibo_l = bibo_gains::get_bibo_gain_l(d, true); - float bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); + double bibo_l = bibo_gains::get_bibo_gain_l(d, true); + double bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); X = (ui32) ceil(log(bibo_h * bibo_l) / M_LN2); u8_SPqcd[s++] = (ui8)((B + X) << 3); u8_SPqcd[s++] = (ui8)((B + X) << 3); @@ -1366,9 +1366,8 @@ namespace ojph { void param_nlt::read(infile_base* file) { ui8 buf[6]; - bool result = true; - if (result &= file->read(buf, 6) == 6) + if (file->read(buf, 6) != 6) OJPH_ERROR(0x00050141, "error reading NLT marker segment"); ui16 length = swap_byte(*(ui16*)buf); From 5921b01a9949b767fd270822f7621a6fa206c7ea Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Wed, 18 Sep 2024 20:39:54 +1000 Subject: [PATCH 17/78] Added support for .pfm in ojph_compress and ojph_expand -- not happy with it, but it is good for testing. 
--- src/apps/common/ojph_img_io.h | 112 ++++++- src/apps/ojph_compress/ojph_compress.cpp | 69 +++++ src/apps/ojph_expand/ojph_expand.cpp | 27 ++ src/apps/others/ojph_img_io.cpp | 376 ++++++++++++++++++----- src/core/codestream/ojph_params.cpp | 14 +- src/core/codestream/ojph_params_local.h | 19 +- src/core/codestream/ojph_tile.cpp | 2 +- src/core/common/ojph_params.h | 6 +- 8 files changed, 541 insertions(+), 84 deletions(-) diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h index 8e41493d..7fddf5ba 100644 --- a/src/apps/common/ojph_img_io.h +++ b/src/apps/common/ojph_img_io.h @@ -135,7 +135,7 @@ namespace ojph { ui32 cur_line; si64 start_of_data; - int planar; + bool planar; ui32 bit_depth[3]; bool is_signed[3]; point subsampling[3]; @@ -446,6 +446,67 @@ namespace ojph { size_t buffer_size; }; + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + class pfm_in : public image_in_base + { + public: + pfm_in(mem_fixed_allocator *p = NULL) + { + fh = 0; + fname = NULL; + alloc_p = p; + temp_buf = NULL; + temp_buf_byte_size = 0; + bit_depth = 32; + scale = 0.0f; + little_endian = true; + width = height = num_comps = 0; + + cur_line = 0; + start_of_data = 0; + } + virtual ~pfm_in() + { + close(); + if (alloc_p == NULL && temp_buf) + free(temp_buf); + } + + void open(const char* filename); + void finalize_alloc(); + void configure(ui32 bit_depth) { + assert(bit_depth > 0 && bit_depth <= 32); + this->bit_depth = bit_depth; + } + virtual ui32 read(const line_buf* line, ui32 comp_num); + void close() { if(fh) { fclose(fh); fh = NULL; } fname = NULL; } + + size get_size() { assert(fh); return size(width, height); } + ui32 get_width() { assert(fh); return width; } + ui32 get_height() { assert(fh); return height; } + ui32 get_num_components() { assert(fh); return num_comps; } + + private: + FILE *fh; + const char *fname; + 
mem_fixed_allocator *alloc_p; + float *temp_buf; + ui32 temp_buf_byte_size; + ui32 bit_depth; // this truncates data to bit_depth in the LSB + float scale; + bool little_endian; + ui32 width, height, num_comps; + ui32 cur_line; + si64 start_of_data; + }; + + //////////////////////////////////////////////////////////////////////////// // Accelerators (defined in ojph_img_io_*) typedef void (*conversion_fun)(const line_buf *ln0, const line_buf *ln1, @@ -703,6 +764,55 @@ namespace ojph { ui8* buffer; ui32 buffer_size; }; + + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + class pfm_out : public image_out_base + { + public: + pfm_out() + { + fh = NULL; + fname = NULL; + buffer = NULL; + buffer_size = 0; + width = height = num_components = 0; + scale = -1.0f; + bit_depth = 32; + cur_line = 0; + start_of_data = 0; + } + virtual ~pfm_out() + { + close(); + if (buffer) + free(buffer); + } + + void open(char* filename); + void configure(ui32 width, ui32 height, ui32 num_components, + float scale, ui32 bit_depth); + virtual ui32 write(const line_buf* line, ui32 comp_num); + virtual void close() { if(fh) { fclose(fh); fh = NULL; } fname = NULL; } + + private: + FILE *fh; + const char *fname; + float* buffer; + ui32 buffer_size; + ui32 width, height, num_components; + float scale; + ui32 bit_depth; + ui32 cur_line; + si64 start_of_data; + }; + + } #endif // !OJPH_IMG_IO_H diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp index 0c4aa0e6..e1a67029 100644 --- a/src/apps/ojph_compress/ojph_compress.cpp +++ b/src/apps/ojph_compress/ojph_compress.cpp @@ -611,6 +611,7 @@ int main(int argc, char * argv[]) { ojph::codestream codestream; ojph::ppm_in ppm; + ojph::pfm_in pfm; ojph::yuv_in yuv; ojph::raw_in raw; ojph::dpx_in dpx; @@ -736,6 +737,74 @@ int main(int argc, char * argv[]) { base = &ppm; } + else if 
(is_matching(".pfm", v)) + { + pfm.open(input_filename); + ojph::param_siz siz = codestream.access_siz(); + siz.set_image_extent(ojph::point(image_offset.x + pfm.get_width(), + image_offset.y + pfm.get_height())); + ojph::ui32 num_comps = pfm.get_num_components(); + assert(num_comps == 1 || num_comps == 3); + siz.set_num_components(num_comps); + + pfm.configure(bit_depth[0]); + for (ojph::ui32 c = 0; c < num_comps; ++c) { + ojph::ui32 bd = 32; + if (bit_depth[c] != 0) + bd = bit_depth[c]; + bool is = false; + if (is_signed[c] != -1) + is = is_signed[c] != 0; + ojph::point ds(1, 1); + siz.set_component(c, ds, bd, is); + } + siz.set_image_offset(image_offset); + siz.set_tile_size(tile_size); + siz.set_tile_offset(tile_offset); + + ojph::param_cod cod = codestream.access_cod(); + cod.set_num_decomposition(num_decompositions); + cod.set_block_dims(block_size.w, block_size.h); + if (num_precincts != -1) + cod.set_precinct_size(num_precincts, precinct_size); + cod.set_progression_order(prog_order); + if (employ_color_transform == -1) + cod.set_color_transform(true); + else + cod.set_color_transform(employ_color_transform == 1); + cod.set_reversible(reversible); + if (!reversible && quantization_step != -1.0f) + codestream.access_qcd().set_irrev_quant(quantization_step); + + ojph::param_nlt nlt = codestream.access_nlt(); + if (reversible) + nlt.set_type3_transformation(ojph::param_nlt::ALL_COMPS, true); + else + OJPH_ERROR(0x01000091, "The support for pfm image is not " + "complete; I need to figure how to modify the interface " + "to better support the exchange of floating point data. 
" + "Feeding float point data is not supported yet, unless it " + "is for lossless compression."); + + codestream.set_planar(false); + if (profile_string[0] != '\0') + codestream.set_profile(profile_string); + codestream.set_tilepart_divisions(tileparts_at_resolutions, + tileparts_at_components); + codestream.request_tlm_marker(tlm_marker); + + if (dims.w != 0 || dims.h != 0) + OJPH_WARN(0x01000092, + "-dims option is not needed and was not used\n"); + if (num_components != 0) + OJPH_WARN(0x01000093, + "-num_comps is not needed and was not used\n"); + if (comp_downsampling[0].x != 0 || comp_downsampling[0].y != 0) + OJPH_WARN(0x01000094, + "-downsamp is not needed and was not used\n"); + + base = &pfm; + } #ifdef OJPH_ENABLE_TIFF_SUPPORT else if (is_matching(".tif", v) || is_matching(".tiff", v)) { diff --git a/src/apps/ojph_expand/ojph_expand.cpp b/src/apps/ojph_expand/ojph_expand.cpp index 7d6f3d54..2adb535d 100644 --- a/src/apps/ojph_expand/ojph_expand.cpp +++ b/src/apps/ojph_expand/ojph_expand.cpp @@ -213,6 +213,7 @@ int main(int argc, char *argv[]) { ojph::codestream codestream; ojph::ppm_out ppm; + ojph::pfm_out pfm; #ifdef OJPH_ENABLE_TIFF_SUPPORT ojph::tif_out tif; #endif /* OJPH_ENABLE_TIFF_SUPPORT */ @@ -266,6 +267,32 @@ int main(int argc, char *argv[]) { ppm.open(output_filename); base = &ppm; } + else if (is_matching(".pfm", v)) + { + codestream.set_planar(false); + ojph::param_siz siz = codestream.access_siz(); + + ojph::ui32 num_comps = siz.get_num_components(); + if (num_comps != 3 && num_comps != 1) + OJPH_ERROR(0x0200000C, + "The file has %d color components; this cannot be saved to" + " a .pfm file\n", num_comps); + bool all_same = true; + ojph::point p = siz.get_downsampling(0); + for (ojph::ui32 i = 1; i < siz.get_num_components(); ++i) + { + ojph::point p1 = siz.get_downsampling(i); + all_same = all_same && (p1.x == p.x) && (p1.y == p.y); + } + if (!all_same) + OJPH_ERROR(0x0200000D, + "To save an image to ppm, all the components must have the 
" + "same downsampling ratio\n"); + pfm.configure(siz.get_recon_width(0), siz.get_recon_height(0), + siz.get_num_components(), -1.0f, siz.get_bit_depth(0)); + pfm.open(output_filename); + base = &pfm; + } #ifdef OJPH_ENABLE_TIFF_SUPPORT else if (is_matching(".tif", v) || is_matching(".tiff", v)) { diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index 82bbe105..93dd4534 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -247,7 +247,7 @@ namespace ojph { assert(fh == 0); fh = fopen(filename, "rb"); if (fh == 0) - OJPH_ERROR(0x030000001, "Unable to open file %s", filename); + OJPH_ERROR(0x03000001, "Unable to open file %s", filename); fname = filename; // read magic number @@ -255,27 +255,27 @@ namespace ojph { if (fread(t, 1, 2, fh) != 2) { close(); - OJPH_ERROR(0x030000002, "Error reading file %s", filename); + OJPH_ERROR(0x03000002, "Error reading file %s", filename); } // check magic number if (t[0] != 'P' || (t[1] != '5' && t[1] != '6')) { close(); - OJPH_ERROR(0x030000003, "unknown file type for file %s", filename); + OJPH_ERROR(0x03000003, "unknown file type for file %s", filename); } size_t len = strlen(filename); if (t[1] == '5' && strncmp(filename + len - 4, ".pgm", 4) != 0) { close(); - OJPH_ERROR(0x030000004, "wrong file extension, a file with " + OJPH_ERROR(0x03000004, "wrong file extension, a file with " "keyword P5 must have a .pgm extension for file %s", filename); } if (t[1] == '6' && strncmp(filename + len - 4, ".ppm", 4) != 0) { close(); - OJPH_ERROR(0x030000005, "wrong file extension, a file with keyword P6 " + OJPH_ERROR(0x03000005, "wrong file extension, a file with keyword P6 " "must have a .ppm extension for file %s", filename); } @@ -287,7 +287,7 @@ namespace ojph { if (fscanf(fh, "%d %d %d", &width, &height, &max_val) != 3) { close(); - OJPH_ERROR(0x030000006, "error in file format for file %s", filename); + OJPH_ERROR(0x03000006, "error in file format for file %s", filename); 
} num_ele_per_line = num_comps * width; bytes_per_sample = max_val > 255 ? 2 : 1; @@ -309,7 +309,7 @@ namespace ojph { temp_buf = malloc(temp_buf_byte_size); if (temp_buf == NULL) { // failed to allocate memory if (t) free(t); // the original buffer is still valid - OJPH_ERROR(0x030000007, "error allocating memory"); + OJPH_ERROR(0x03000007, "error allocating memory"); } } else @@ -347,7 +347,7 @@ namespace ojph { if (result != num_ele_per_line) { close(); - OJPH_ERROR(0x030000011, "not enough data in file %s", fname); + OJPH_ERROR(0x03000011, "not enough data in file %s", fname); } if (++cur_line >= height) { @@ -394,17 +394,17 @@ namespace ojph { if (strncmp(".ppm", filename + len - 4, 4) == 0) { filename[len - 2] = 'g'; - OJPH_WARN(0x03000001, "file was renamed %s\n", filename); + OJPH_WARN(0x03000021, "file was renamed %s\n", filename); } if (strncmp(".PPM", filename + len - 4, 4) == 0) { filename[len - 2] = 'G'; - OJPH_WARN(0x03000002, "file was renamed %s\n", filename); + OJPH_WARN(0x03000022, "file was renamed %s\n", filename); } } fh = fopen(filename, "wb"); if (fh == NULL) - OJPH_ERROR(0x030000021, + OJPH_ERROR(0x03000023, "unable to open file %s for writing", filename); fprintf(fh, "P5\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); @@ -419,22 +419,22 @@ namespace ojph { if (strncmp(".pgm", filename + len - 4, 4) == 0) { filename[len - 2] = 'p'; - OJPH_WARN(0x03000003, "file was renamed %s\n", filename); + OJPH_WARN(0x03000024, "file was renamed %s\n", filename); } if (strncmp(".PGM", filename + len - 4, 4) == 0) { filename[len - 2] = 'P'; - OJPH_WARN(0x03000004, "file was renamed %s\n", filename); + OJPH_WARN(0x03000025, "file was renamed %s\n", filename); } } fh = fopen(filename, "wb"); if (fh == NULL) - OJPH_ERROR(0x030000022, + OJPH_ERROR(0x03000026, "unable to open file %s for writing", filename); int result = //the number of written characters fprintf(fh, "P6\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); if (result == 0) - 
OJPH_ERROR(0x030000023, "error writing to file %s", filename); + OJPH_ERROR(0x03000027, "error writing to file %s", filename); buffer_size = width * num_components * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } @@ -448,7 +448,7 @@ namespace ojph { { assert(fh == NULL); //configure before opening if (num_components != 1 && num_components != 3) - OJPH_ERROR(0x030000031, + OJPH_ERROR(0x03000031, "ppm supports 3 colour components, while pgm supports 1"); this->width = width; this->height = height; @@ -530,12 +530,244 @@ namespace ojph { size_t result = fwrite(buffer, bytes_per_sample, samples_per_line, fh); if (result != samples_per_line) - OJPH_ERROR(0x030000042, "error writing to file %s", fname); + OJPH_ERROR(0x03000041, "error writing to file %s", fname); } return 0; } //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + + ///////////////////////////////////////////////////////////////////////////// + void pfm_in::open(const char *filename) + { + assert(fh == 0); + fh = fopen(filename, "rb"); + if (fh == 0) + OJPH_ERROR(0x03000051, "Unable to open file %s", filename); + fname = filename; + + // read magic number + char t[2]; + if (fread(t, 1, 2, fh) != 2) + { + close(); + OJPH_ERROR(0x03000052, "Error reading file %s", filename); + } + + // check magic number + if (t[0] != 'P' || (t[1] != 'F' && t[1] != 'f')) + { + close(); + OJPH_ERROR(0x03000053, "Unknown file type for file %s", filename); + } + + // set number of components based on file-type + num_comps = t[1] == 'f' ? 
1 : 3; + eat_white_spaces(fh); + + // read width, height and max value in header + if (fscanf(fh, "%d %d", &width, &height) != 2) + { + close(); + OJPH_ERROR(0x03000054, + "Error reading width and height in file %s", filename); + } + eat_white_spaces(fh); + + // little or big-endian + if (fscanf(fh, "%f", &scale) != 1) + { + close(); + OJPH_ERROR(0x03000055, "Error reading scale in file %s", filename); + } + little_endian = scale < 0.0f; + scale = std::abs(scale); + + fgetc(fh); + start_of_data = ojph_ftell(fh); + + // alloc. linebuffer to hold a line of image data, if more than 1 comp. + if (temp_buf_byte_size < num_comps * width * sizeof(float)) + { + if (alloc_p == NULL) + { + temp_buf_byte_size = num_comps * width * sizeof(float); + void* t = temp_buf; + if (temp_buf) + temp_buf = (float*)realloc(temp_buf, temp_buf_byte_size); + else + temp_buf = (float*)malloc(temp_buf_byte_size); + if (temp_buf == NULL) { // failed to allocate memory + if (t) free(t); // the original buffer is still valid + OJPH_ERROR(0x03000056, "Error allocating memory"); + } + } + else + { + assert(temp_buf_byte_size == 0); //cannot reallocate the buffer + temp_buf_byte_size = num_comps * width * sizeof(float); + alloc_p->pre_alloc_data(temp_buf_byte_size, 0); + } + } + cur_line = 0; + } + + ///////////////////////////////////////////////////////////////////////////// + void pfm_in::finalize_alloc() + { + if (alloc_p == NULL) + return; + temp_buf = alloc_p->post_alloc_data(num_comps * width, 0); + } + + ///////////////////////////////////////////////////////////////////////////// + ui32 pfm_in::read(const line_buf* line, ui32 comp_num) + { + assert(temp_buf_byte_size != 0 ); + assert(fh != 0 && comp_num < num_comps); + assert(line->size >= width); + + if (comp_num == 0) + { + si64 loc = start_of_data; + loc += (height - 1 - cur_line) * num_comps * width * sizeof(float); + if (ojph_fseek(fh, loc, SEEK_SET) != 0) + { + close(); + OJPH_ERROR(0x03000061, "Error seeking in file %s", fname); + } 
+ size_t result = fread(temp_buf, sizeof(float), num_comps * width, fh); + if (result != num_comps * width) + { + close(); + OJPH_ERROR(0x03000062, "Not enough data in file %s", fname); + } + if (++cur_line >= height) + cur_line = 0; + } + + if (little_endian) + { + ui32 shift = 32 - bit_depth; + const float* sp = temp_buf + comp_num; + float* dp = line->f32; + if (shift) + for (ui32 i = width; i > 0; --i, sp += num_comps) + { + ui32 v = *(ui32*)sp; + v >>= shift; + *dp++ = *(float*)&v; + } + else + for (ui32 i = width; i > 0; --i, sp += num_comps) + *dp++ = *sp; + } + else { + ui32 shift = 32 - bit_depth; + const float* sp = temp_buf + comp_num; + float* dp = line->f32; + if (shift) + for (ui32 i = width; i > 0; --i, sp += num_comps) { + ui32 v = be2le(*(ui32*)sp); + v >>= shift; + *dp++ = *(float*)&v; + } + else + for (ui32 i = width; i > 0; --i, sp += num_comps) { + ui32 v = be2le(*(ui32*)sp); + *dp++ = *(float*)&v; + } + } + + return width; + } + + //////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + //////////////////////////////////////////////////////////////////////////// + + //////////////////////////////////////////////////////////////////////////// + void pfm_out::open(char* filename) + { + assert(fh == NULL && buffer == NULL); + fh = fopen(filename, "wb"); + if (fh == NULL) + OJPH_ERROR(0x03000071, + "Unable to open file %s for writing", filename); + int result = //the number of written characters + fprintf(fh, "P%c\n%d %d\n%f\n", + num_components > 1 ? 
'F' : 'f', width, height, scale); + if (result == 0) + OJPH_ERROR(0x03000072, "error writing to file %s", filename); + buffer_size = width * num_components * sizeof(float); + buffer = (float*)malloc(buffer_size); + fname = filename; + cur_line = 0; + start_of_data = ojph_ftell(fh); + } + + //////////////////////////////////////////////////////////////////////////// + void pfm_out::configure(ui32 width, ui32 height, ui32 num_components, + float scale, ui32 bit_depth) + { + assert(fh == NULL); //configure before opening + if (num_components != 1 && num_components != 3) + OJPH_ERROR(0x03000081, + "pfm supports 1 or 3 colour components, not %d", num_components); + this->width = width; + this->height = height; + this->num_components = num_components; + this->scale = scale < 0.0f ? scale : -scale; + this->bit_depth = bit_depth; + } + + //////////////////////////////////////////////////////////////////////////// + ui32 pfm_out::write(const line_buf* line, ui32 comp_num) + { + assert(fh); + + ui32 shift = 32 - bit_depth; + float* dp = buffer + comp_num; + const float* sp = line->f32; + + if (shift) + for (ui32 i = width; i > 0; --i, dp += num_components, ++sp) + { + ui32 v = *(ui32*)sp; + v <<= shift; + *dp = *(float*)&v; + } + else + for (ui32 i = width; i > 0; --i, dp += num_components) + *dp = *sp++; + + if (comp_num == num_components - 1) + { + size_t samples_per_line = num_components * width; + si64 loc = start_of_data; + loc += (height - 1 - cur_line)* samples_per_line * sizeof(float); + if (ojph_fseek(fh, loc, SEEK_SET) != 0) + OJPH_ERROR(0x03000082, "Error seeking in file %s", fname); + size_t result = fwrite(buffer, sizeof(float), samples_per_line, fh); + if (result != samples_per_line) + OJPH_ERROR(0x03000083, "error writing to file %s", fname); + ++cur_line; + } + + return 0; + } + + //////////////////////////////////////////////////////////////////////////// // // // @@ -548,7 +780,7 @@ namespace ojph { { tiff_handle = NULL; if ((tiff_handle = 
TIFFOpen(filename, "r")) == NULL) - OJPH_ERROR(0x0300000B1, "Unable to open file %s", filename); + OJPH_ERROR(0x03000091, "Unable to open file %s", filename); fname = filename; ui32 tiff_width = 0; @@ -588,7 +820,7 @@ namespace ojph { // allocate linebuffer to hold a line of image data line_buffer = malloc(bytes_per_line); if (NULL == line_buffer) - OJPH_ERROR(0x0300000B2, "Unable to allocate %d bytes for line_buffer[] " + OJPH_ERROR(0x03000092, "Unable to allocate %d bytes for line_buffer[] " "for file %s", bytes_per_line, filename); cur_line = 0; @@ -596,7 +828,7 @@ namespace ojph { // Error on known incompatilbe input formats if( tiff_bits_per_sample != 8 && tiff_bits_per_sample != 16 ) { - OJPH_ERROR(0x0300000B3, "\nTIFF IO is currently limited" + OJPH_ERROR(0x03000093, "\nTIFF IO is currently limited" " to files with TIFFTAG_BITSPERSAMPLE=8 and TIFFTAG_BITSPERSAMPLE=16 \n" "input file = %s has TIFFTAG_BITSPERSAMPLE=%d", filename, tiff_bits_per_sample); @@ -604,14 +836,14 @@ namespace ojph { if( TIFFIsTiled( tiff_handle ) ) { - OJPH_ERROR(0x0300000B4, "\nTIFF IO is currently limited to TIF files " + OJPH_ERROR(0x03000094, "\nTIFF IO is currently limited to TIF files " "without tiles. \nInput file %s has been detected as tiled", filename); } if(PHOTOMETRIC_RGB != tiff_photometric && PHOTOMETRIC_MINISBLACK != tiff_photometric ) { - OJPH_ERROR(0x0300000B5, "\nTIFF IO is currently limited to " + OJPH_ERROR(0x03000095, "\nTIFF IO is currently limited to " "TIFFTAG_PHOTOMETRIC=PHOTOMETRIC_MINISBLACK=%d and " "PHOTOMETRIC_RGB=%d. 
\nInput file %s has been detected " "TIFFTAG_PHOTOMETRIC=%d", @@ -620,7 +852,7 @@ namespace ojph { if( tiff_samples_per_pixel > 4 ) { - OJPH_ERROR(0x0300000B6, "\nTIFF IO is currently limited to " + OJPH_ERROR(0x03000096, "\nTIFF IO is currently limited to " "TIFFTAG_SAMPLESPERPIXEL=4 \nInput file %s has been detected with " "TIFFTAG_SAMPLESPERPIXEL=%d", filename, tiff_samples_per_pixel); @@ -642,7 +874,7 @@ namespace ojph { line_buffer_for_planar_support_uint8 = (uint8_t*)calloc(width, sizeof(uint8_t)); if (NULL == line_buffer_for_planar_support_uint8) - OJPH_ERROR(0x0300000B7, "Unable to allocate %d bytes for " + OJPH_ERROR(0x03000097, "Unable to allocate %d bytes for " "line_buffer_for_planar_support_uint8[] for file %s", width * sizeof(uint8_t), filename); } @@ -652,7 +884,7 @@ namespace ojph { line_buffer_for_planar_support_uint16 = (uint16_t*)calloc(width, sizeof(uint16_t)); if (NULL == line_buffer_for_planar_support_uint16) - OJPH_ERROR(0x0300000B8, "Unable to allocate %d bytes for " + OJPH_ERROR(0x03000098, "Unable to allocate %d bytes for " "line_buffer_for_planar_support_uint16[] for file %s", width * sizeof(uint16_t), filename); } @@ -664,7 +896,7 @@ namespace ojph { void tif_in::set_bit_depth(ui32 num_bit_depths, ui32* bit_depth) { if (num_bit_depths < 1) - OJPH_ERROR(0x030000B9, "one or more bit_depths must be provided"); + OJPH_ERROR(0x030000A1, "one or more bit_depths must be provided"); ui32 last_bd_idx = 0; for (ui32 i = 0; i < 4; ++i) { @@ -673,7 +905,7 @@ namespace ojph { if (bd > 32 || bd < 1) { - OJPH_ERROR(0x0300000BA, + OJPH_ERROR(0x030000A2, "bit_depth = %d, this must be an integer from 1-32", bd); } this->bit_depth[i] = bd; @@ -809,20 +1041,20 @@ namespace ojph { } if (max_bitdepth > 16) { - OJPH_WARN(0x0300000C2, "TIFF output is currently limited to files " + OJPH_WARN(0x030000B1, "TIFF output is currently limited to files " "with max_bitdepth = 16, the source codestream has max_bitdepth=%d" ", the decoded data will be truncated to 16 
bits", max_bitdepth); } if (num_components > 4) { - OJPH_ERROR(0x0300000C3, "TIFF IO is currently limited to files with " + OJPH_ERROR(0x030000B2, "TIFF IO is currently limited to files with " "num_components=1 to 4"); } assert(tiff_handle == NULL && buffer == NULL); if ((tiff_handle = TIFFOpen(filename, "w")) == NULL) { - OJPH_ERROR(0x0300000C1, "unable to open file %s for writing", filename); + OJPH_ERROR(0x030000B3, "unable to open file %s for writing", filename); } buffer_size = width * num_components * bytes_per_sample; @@ -1014,7 +1246,7 @@ namespace ojph { { int result = TIFFWriteScanline(tiff_handle, buffer, cur_line++); if (result != 1) - OJPH_ERROR(0x0300000C4, "error writing to file %s", fname); + OJPH_ERROR(0x030000C1, "error writing to file %s", fname); } return 0; } @@ -1034,7 +1266,7 @@ namespace ojph { assert(fh == NULL); fh = fopen(filename, "rb"); if (fh == 0) - OJPH_ERROR(0x03000051, "Unable to open file %s", filename); + OJPH_ERROR(0x030000D1, "Unable to open file %s", filename); //need to extract info from filename @@ -1062,7 +1294,7 @@ namespace ojph { if (result != width[comp_num]) { close(); - OJPH_ERROR(0x03000061, "not enough data in file %s", fname); + OJPH_ERROR(0x030000E1, "not enough data in file %s", fname); } if (bytes_per_sample[comp_num] == 1) @@ -1088,11 +1320,11 @@ namespace ojph { ui32 num_downsamplings, const point *subsampling) { if (num_components != 1 && num_components !=3) - OJPH_ERROR(0x03000071, "yuv_in support 1 or 3 components"); + OJPH_ERROR(0x030000F1, "yuv_in support 1 or 3 components"); this->num_com = num_components; if (num_downsamplings < 1) - OJPH_ERROR(0x03000072, "one or more downsampling must be provided"); + OJPH_ERROR(0x030000F2, "one or more downsampling must be provided"); ui32 last_downsamp_idx = 0; for (ui32 i = 0; i < num_components; ++i) @@ -1114,7 +1346,7 @@ namespace ojph { void yuv_in::set_bit_depth(ui32 num_bit_depths, ui32* bit_depth) { if (num_bit_depths < 1) - OJPH_ERROR(0x03000081, "one or 
more bit_depths must be provided"); + OJPH_ERROR(0x03000101, "one or more bit_depths must be provided"); ui32 last_bd_idx = 0; for (ui32 i = 0; i < 3; ++i) { @@ -1156,7 +1388,7 @@ namespace ojph { assert(fh == NULL); //configure before open fh = fopen(filename, "wb"); if (fh == 0) - OJPH_ERROR(0x03000091, "Unable to open file %s", filename); + OJPH_ERROR(0x03000111, "Unable to open file %s", filename); fname = filename; } @@ -1199,7 +1431,7 @@ namespace ojph { *dp++ = (ui16)val; } if (fwrite(buffer, 2, w, fh) != w) - OJPH_ERROR(0x030000A1, "unable to write to file %s", fname); + OJPH_ERROR(0x03000121, "unable to write to file %s", fname); } else { @@ -1213,7 +1445,7 @@ namespace ojph { *dp++ = (ui8)val; } if (fwrite(buffer, 1, w, fh) != w) - OJPH_ERROR(0x030000A2, "unable to write to file %s", fname); + OJPH_ERROR(0x03000122, "unable to write to file %s", fname); } return w; @@ -1233,7 +1465,7 @@ namespace ojph { assert(fh == NULL); fh = fopen(filename, "rb"); if (fh == NULL) - OJPH_ERROR(0x030000C1, "Unable to open file %s", filename); + OJPH_ERROR(0x03000131, "Unable to open file %s", filename); cur_line = 0; bytes_per_sample = (bit_depth + 7) >> 3; @@ -1251,7 +1483,7 @@ namespace ojph { if (result != width) { close(); - OJPH_ERROR(0x030000C2, "not enough data in file %s", fname); + OJPH_ERROR(0x03000132, "not enough data in file %s", fname); } if (bytes_per_sample > 3) @@ -1360,7 +1592,7 @@ namespace ojph { assert(fh == NULL); //configure before open fh = fopen(filename, "wb"); if (fh == 0) - OJPH_ERROR(0x03000091, "Unable to open file %s", filename); + OJPH_ERROR(0x03000141, "Unable to open file %s", filename); fname = filename; } @@ -1404,7 +1636,7 @@ namespace ojph { *dp++ = (ui32)val; } if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B1, "unable to write to file %s", fname); + OJPH_ERROR(0x03000151, "unable to write to file %s", fname); } else if (bytes_per_sample > 2) { @@ -1420,7 +1652,7 @@ namespace ojph { dp = 
(ui32*)((ui8*)dp + 3); } if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B2, "unable to write to file %s", fname); + OJPH_ERROR(0x03000152, "unable to write to file %s", fname); } else if (bytes_per_sample > 1) { @@ -1434,7 +1666,7 @@ namespace ojph { *dp++ = (ui16)val; } if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B3, "unable to write to file %s", fname); + OJPH_ERROR(0x03000153, "unable to write to file %s", fname); } else { @@ -1448,7 +1680,7 @@ namespace ojph { *dp++ = (ui8)val; } if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x030000B4, "unable to write to file %s", fname); + OJPH_ERROR(0x03000154, "unable to write to file %s", fname); } return width; @@ -1470,7 +1702,7 @@ namespace ojph { assert(file_handle == 0); file_handle = fopen(filename, "rb"); if (0 == file_handle) - OJPH_ERROR(0x0300000D1, "Unable to open file %s", filename); + OJPH_ERROR(0x03000161, "Unable to open file %s", filename); fname = filename; // read magic number @@ -1478,7 +1710,7 @@ namespace ojph { if (fread(&magic_number, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000D2, "Error reading file %s", filename); + OJPH_ERROR(0x03000162, "Error reading file %s", filename); } // check magic number @@ -1497,7 +1729,7 @@ namespace ojph { else { close(); - OJPH_ERROR(0x0300000D3, "Error reading file %s - this does not appear " + OJPH_ERROR(0x03000163, "Error reading file %s - this does not appear " "to be a valid DPX file. It has magic number = 0x%08X. 
The magic " "number of a DPX file is 0x%08X.", filename, magic_number, dpx_magic_number); @@ -1508,7 +1740,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000D4, "Error reading file %s", filename); + OJPH_ERROR(0x03000164, "Error reading file %s", filename); } if (is_byte_swapping_necessary) offset_to_image_data_in_bytes = be2le(offset_to_image_data_in_bytes); @@ -1516,14 +1748,14 @@ namespace ojph { if (fread(version, sizeof(uint8_t), 8, file_handle) != 8) { close(); - OJPH_ERROR(0x0300000D5, "Error reading file %s", filename); + OJPH_ERROR(0x03000165, "Error reading file %s", filename); } // read image file size in bytes if (fread(&total_image_file_size_in_bytes, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000D6, "Error reading file %s", filename); + OJPH_ERROR(0x03000166, "Error reading file %s", filename); } if (is_byte_swapping_necessary) total_image_file_size_in_bytes = be2le(total_image_file_size_in_bytes); @@ -1532,14 +1764,14 @@ namespace ojph { if (fseek(file_handle,768, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000D7, "Error reading file %s", filename); + OJPH_ERROR(0x03000167, "Error reading file %s", filename); } // read image_orientation if (fread(&image_orientation, sizeof(uint16_t), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000D8, "Error reading file %s", filename); + OJPH_ERROR(0x03000168, "Error reading file %s", filename); } if (is_byte_swapping_necessary) image_orientation = be2le(image_orientation); @@ -1549,7 +1781,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000D9, "Error reading file %s", filename); + OJPH_ERROR(0x03000169, "Error reading file %s", filename); } if (is_byte_swapping_necessary) number_of_image_elements = be2le(number_of_image_elements); @@ -1558,7 +1790,7 @@ namespace ojph { if (fread(&pixels_per_line, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000DA, "Error reading file %s", filename); + OJPH_ERROR(0x0300016A, "Error reading file %s", filename); 
} if (is_byte_swapping_necessary) pixels_per_line = be2le(pixels_per_line); @@ -1567,7 +1799,7 @@ namespace ojph { if (fread(&lines_per_image_element, sizeof(ui32), 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000DB, "Error reading file %s", filename); + OJPH_ERROR(0x0300016B, "Error reading file %s", filename); } if (is_byte_swapping_necessary) lines_per_image_element = be2le(lines_per_image_element); @@ -1576,7 +1808,7 @@ namespace ojph { if (fseek(file_handle, 780, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000DC, "Error reading file %s", filename); + OJPH_ERROR(0x0300016C, "Error reading file %s", filename); } // read data sign for image element @@ -1584,7 +1816,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000DE, "Error reading file %s", filename); + OJPH_ERROR(0x0300016E, "Error reading file %s", filename); } if (is_byte_swapping_necessary) data_sign_for_image_element_1 = be2le(data_sign_for_image_element_1); @@ -1593,7 +1825,7 @@ namespace ojph { if (fseek(file_handle, 800, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000DF, "Error reading file %s", filename); + OJPH_ERROR(0x0300016F, "Error reading file %s", filename); } // read descriptor @@ -1601,7 +1833,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E0, "Error reading file %s", filename); + OJPH_ERROR(0x03000170, "Error reading file %s", filename); } // read transfer characteristic @@ -1609,7 +1841,7 @@ namespace ojph { 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000E1, "Error reading file %s", filename); + OJPH_ERROR(0x03000171, "Error reading file %s", filename); } // read colorimetric specification @@ -1617,7 +1849,7 @@ namespace ojph { 1, file_handle) != 1) { close(); - OJPH_ERROR(0x0300000E2, "Error reading file %s", filename); + OJPH_ERROR(0x03000172, "Error reading file %s", filename); } // read bit depth @@ -1625,7 +1857,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E3, "Error reading file %s", filename); + OJPH_ERROR(0x03000173, "Error 
reading file %s", filename); } // read packing @@ -1633,7 +1865,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E4, "Error reading file %s", filename); + OJPH_ERROR(0x03000174, "Error reading file %s", filename); } if (is_byte_swapping_necessary) packing_for_image_element_1 = be2le(packing_for_image_element_1); @@ -1643,7 +1875,7 @@ namespace ojph { != 1) { close(); - OJPH_ERROR(0x0300000E5, "Error reading file %s", filename); + OJPH_ERROR(0x03000175, "Error reading file %s", filename); } if (is_byte_swapping_necessary) encoding_for_image_element_1 = be2le(encoding_for_image_element_1); @@ -1653,7 +1885,7 @@ namespace ojph { file_handle) != 1) { close(); - OJPH_ERROR(0x0300000E6, "Error reading file %s", filename); + OJPH_ERROR(0x03000176, "Error reading file %s", filename); } if (is_byte_swapping_necessary) offset_to_data_for_image_element_1 = @@ -1663,7 +1895,7 @@ namespace ojph { if (fseek(file_handle, (long)offset_to_image_data_in_bytes, SEEK_SET) != 0) { close(); - OJPH_ERROR(0x0300000E7, "Error reading file %s", filename); + OJPH_ERROR(0x03000177, "Error reading file %s", filename); } // set ojph properties @@ -1689,7 +1921,7 @@ namespace ojph { // allocate linebuffer to hold a line of image data from the file line_buffer = malloc(number_of_32_bit_words_per_line * sizeof(ui32) ); if (NULL == line_buffer) - OJPH_ERROR(0x0300000E8, "Unable to allocate %d bytes for line_buffer[] " + OJPH_ERROR(0x03000178, "Unable to allocate %d bytes for line_buffer[] " "for file %s", number_of_32_bit_words_per_line * sizeof(ui32), filename); @@ -1697,7 +1929,7 @@ namespace ojph { line_buffer_16bit_samples = (ui16*) malloc(width * num_comps * sizeof(ui16)); if (NULL == line_buffer_16bit_samples) - OJPH_ERROR(0x0300000E9, "Unable to allocate %d bytes for " + OJPH_ERROR(0x03000179, "Unable to allocate %d bytes for " "line_buffer_16bit_samples[] for file %s", width * num_comps * sizeof(ui16), filename); @@ -1719,7 +1951,7 @@ namespace ojph { file_handle) != 
number_of_32_bit_words_per_line) { close(); - OJPH_ERROR(0x0300000F1, "Error reading file %s", fname); + OJPH_ERROR(0x03000181, "Error reading file %s", fname); } if (true == is_byte_swapping_necessary) @@ -1773,7 +2005,7 @@ namespace ojph { } else { - OJPH_ERROR(0x0300000F2, "file %s uses DPX image formats that are not " + OJPH_ERROR(0x03000182, "file %s uses DPX image formats that are not " "yet supported by this software\n bitdepth_for_image_element_1 = " "%d\n num_comps=%d\npacking_for_image_element_1=%d\n " "descriptor_for_image_element_1=%d", fname, diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index f250c130..0ef3fd2b 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -373,13 +373,13 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void param_nlt::set_type3_transformation(ui16 comp_num, bool enable) + void param_nlt::set_type3_transformation(ui32 comp_num, bool enable) { state->set_type3_transformation(comp_num, enable); } ////////////////////////////////////////////////////////////////////////// - bool param_nlt::get_type3_transformation(ui16 comp_num, ui8& bit_depth, + bool param_nlt::get_type3_transformation(ui32 comp_num, ui8& bit_depth, bool& is_signed) { return state->get_type3_transformation(comp_num, bit_depth, is_signed); @@ -1243,7 +1243,7 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void param_nlt::check_validity(const param_siz& siz) + void param_nlt::check_validity(param_siz& siz) { if (is_any_enabled() == false) return; @@ -1269,7 +1269,7 @@ namespace ojph { all_same_bit_depth = all_same_bit_depth && (bit_depth == siz.get_bit_depth(c)); all_same_signedness = - all_same_signedness && (is_signed != siz.is_signed(c)); 
+ all_same_signedness && (is_signed == siz.is_signed(c)); } } else @@ -1308,6 +1308,10 @@ namespace ojph { } trim_non_existing_components(num_comps); + + if (is_any_enabled() == false) + return; + siz.set_Rsiz_flag(param_siz::RSIZ_EXT_FLAG | param_siz::RSIZ_NLT_FLAG); } ////////////////////////////////////////////////////////////////////////// @@ -1376,7 +1380,7 @@ namespace ojph { ui16 comp = swap_byte(*(ui16*)(buf + 2)); param_nlt* p = this; - if (comp != 65535) + if (comp != special_comp_num::ALL_COMPS) { p = get_comp_object(comp); if (p == NULL) diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index fa2f6906..225ad996 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -166,13 +166,20 @@ namespace ojph { { friend ::ojph::param_siz; + public: + enum : ui16 { + RSIZ_NLT_FLAG = 0x200, + RSIZ_HT_FLAG = 0x4000, + RSIZ_EXT_FLAG = 0x8000, + }; + public: param_siz() { memset(this, 0, sizeof(param_siz)); cptr = store; old_Csiz = 4; - Rsiz = 0x4000; //for jph, bit 14 of Rsiz is 1 + Rsiz = RSIZ_HT_FLAG; } ~param_siz() @@ -274,6 +281,11 @@ namespace ojph { bool is_ws_kern_support_needed() { return ws_kern_support_needed; } bool is_dfs_support_needed() { return dfs_support_needed; } + void set_Rsiz_flag(ui16 flag) + { Rsiz |= flag; } + void reset_Rsiz_flag(ui16 flag) + { Rsiz &= ~flag; } + private: ui16 Lsiz; ui16 Rsiz; @@ -670,10 +682,11 @@ namespace ojph { // data structures used by param_nlt struct param_nlt { + using special_comp_num = ojph::param_nlt::special_comp_num; public: param_nlt() { Lnlt = 6; - Cnlt = 65535; // default + Cnlt = special_comp_num::ALL_COMPS; // default BDnlt = 0; Tnlt = 3; enabled = false; next = NULL; alloced_next = false; @@ -686,7 +699,7 @@ namespace ojph { } } - void check_validity(const param_siz& siz); + void check_validity(param_siz& siz); void set_type3_transformation(ui32 comp_num, bool enable); bool get_type3_transformation(ui32 comp_num, 
ui8& bit_depth, bool& is_signed) const; diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index 29377e73..281e1564 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -269,7 +269,7 @@ namespace ojph { memcpy(dp, sp, comp_width * sizeof(si32)); } else - cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width); + cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width); } else { diff --git a/src/core/common/ojph_params.h b/src/core/common/ojph_params.h index 5a74160d..602fd999 100644 --- a/src/core/common/ojph_params.h +++ b/src/core/common/ojph_params.h @@ -139,6 +139,8 @@ namespace ojph { */ class OJPH_EXPORT param_nlt { + public: + enum special_comp_num : ui16 { ALL_COMPS = 65535 }; public: param_nlt(local::param_nlt* p) : state(p) {} @@ -153,7 +155,7 @@ namespace ojph { * @param enable: true to enable nlt type 3 for this component or the default setting, false to disable nlt type 3. */ - void set_type3_transformation(ui16 comp_num, bool enable); + void set_type3_transformation(ui32 comp_num, bool enable); /** * @brief get the state (enabled or disabled) of type 3 nonlinearity @@ -164,7 +166,7 @@ namespace ojph { * @param is_signed: returns true if the component/default is signed * @return true if enabled or false if not. */ - bool get_type3_transformation(ui16 comp_num, ui8& bit_depth, + bool get_type3_transformation(ui32 comp_num, ui8& bit_depth, bool& is_signed); private: From 2a36ea4ab1a8456292b5798828d2148ae42db9ff Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 20 Sep 2024 13:16:18 +1000 Subject: [PATCH 18/78] Bug fixes, pfm with different bit_depth/signedness are supported now. 
--- src/apps/common/ojph_img_io.h | 17 ++++--- src/apps/ojph_compress/ojph_compress.cpp | 33 ++++++++++-- src/apps/ojph_expand/ojph_expand.cpp | 5 +- src/apps/others/ojph_img_io.cpp | 11 ++-- src/core/codestream/ojph_params.cpp | 64 +++++++++++++++--------- src/core/codestream/ojph_params_local.h | 1 + 6 files changed, 89 insertions(+), 42 deletions(-) diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h index 7fddf5ba..5f6488f7 100644 --- a/src/apps/common/ojph_img_io.h +++ b/src/apps/common/ojph_img_io.h @@ -463,7 +463,7 @@ namespace ojph { alloc_p = p; temp_buf = NULL; temp_buf_byte_size = 0; - bit_depth = 32; + bit_depth[0] = bit_depth[1] = bit_depth[2] = 32; scale = 0.0f; little_endian = true; width = height = num_comps = 0; @@ -480,9 +480,10 @@ namespace ojph { void open(const char* filename); void finalize_alloc(); - void configure(ui32 bit_depth) { - assert(bit_depth > 0 && bit_depth <= 32); - this->bit_depth = bit_depth; + void configure(ui32* bit_depth) { + assert(num_comps != 0); + for (ui32 c = 0; c < num_comps; ++c) + this->bit_depth[c] = bit_depth[c]; } virtual ui32 read(const line_buf* line, ui32 comp_num); void close() { if(fh) { fclose(fh); fh = NULL; } fname = NULL; } @@ -498,7 +499,7 @@ namespace ojph { mem_fixed_allocator *alloc_p; float *temp_buf; ui32 temp_buf_byte_size; - ui32 bit_depth; // this truncates data to bit_depth in the LSB + ui32 bit_depth[3]; // this truncates data to bit_depth in the LSB float scale; bool little_endian; ui32 width, height, num_comps; @@ -783,7 +784,7 @@ namespace ojph { buffer_size = 0; width = height = num_components = 0; scale = -1.0f; - bit_depth = 32; + bit_depth[0] = bit_depth[1] = bit_depth[2] = 32; cur_line = 0; start_of_data = 0; } @@ -796,7 +797,7 @@ namespace ojph { void open(char* filename); void configure(ui32 width, ui32 height, ui32 num_components, - float scale, ui32 bit_depth); + float scale, ui32* bit_depth); virtual ui32 write(const line_buf* line, ui32 comp_num); virtual 
void close() { if(fh) { fclose(fh); fh = NULL; } fname = NULL; } @@ -807,7 +808,7 @@ namespace ojph { ui32 buffer_size; ui32 width, height, num_components; float scale; - ui32 bit_depth; + ui32 bit_depth[3]; ui32 cur_line; si64 start_of_data; }; diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp index e1a67029..0ba6a9ef 100644 --- a/src/apps/ojph_compress/ojph_compress.cpp +++ b/src/apps/ojph_compress/ojph_compress.cpp @@ -747,7 +747,28 @@ int main(int argc, char * argv[]) { assert(num_comps == 1 || num_comps == 3); siz.set_num_components(num_comps); - pfm.configure(bit_depth[0]); + if (bit_depth[0] != 0) // one was set + if (num_bit_depths < num_comps) // but if not enough, repeat + for (ojph::ui32 c = num_bit_depths; c < num_comps; ++c) + bit_depth[c] = bit_depth[num_bit_depths - 1]; + if (is_signed[0] != -1) // one was set + if (num_is_signed < num_comps) // but if not enough, repeat + for (ojph::ui32 c = num_is_signed; c < num_comps; ++c) + is_signed[c] = is_signed[num_is_signed - 1]; + + bool all_the_same = true; + if (num_comps == 3) + { + all_the_same = all_the_same + && bit_depth[0] == bit_depth[1] + && bit_depth[1] == bit_depth[2]; + all_the_same = all_the_same + && is_signed[0] == is_signed[1] + && is_signed[1] == is_signed[2]; + } + + pfm.configure(bit_depth); + ojph::point ds(1, 1); for (ojph::ui32 c = 0; c < num_comps; ++c) { ojph::ui32 bd = 32; if (bit_depth[c] != 0) @@ -755,7 +776,6 @@ int main(int argc, char * argv[]) { bool is = false; if (is_signed[c] != -1) is = is_signed[c] != 0; - ojph::point ds(1, 1); siz.set_component(c, ds, bd, is); } siz.set_image_offset(image_offset); @@ -777,8 +797,13 @@ int main(int argc, char * argv[]) { codestream.access_qcd().set_irrev_quant(quantization_step); ojph::param_nlt nlt = codestream.access_nlt(); - if (reversible) - nlt.set_type3_transformation(ojph::param_nlt::ALL_COMPS, true); + if (reversible) { + if (all_the_same) + 
nlt.set_type3_transformation(ojph::param_nlt::ALL_COMPS, true); + else + for (ojph::ui32 c = 0; c < num_comps; ++c) + nlt.set_type3_transformation(c, true); + } else OJPH_ERROR(0x01000091, "The support for pfm image is not " "complete; I need to figure how to modify the interface " diff --git a/src/apps/ojph_expand/ojph_expand.cpp b/src/apps/ojph_expand/ojph_expand.cpp index 2adb535d..c3940389 100644 --- a/src/apps/ojph_expand/ojph_expand.cpp +++ b/src/apps/ojph_expand/ojph_expand.cpp @@ -288,8 +288,11 @@ int main(int argc, char *argv[]) { OJPH_ERROR(0x0200000D, "To save an image to ppm, all the components must have the " "same downsampling ratio\n"); + ojph::ui32 bit_depth[3]; + for (ojph::ui32 c = 0; c < siz.get_num_components(); ++c) + bit_depth[c] = siz.get_bit_depth(c); pfm.configure(siz.get_recon_width(0), siz.get_recon_height(0), - siz.get_num_components(), -1.0f, siz.get_bit_depth(0)); + siz.get_num_components(), -1.0f, bit_depth); pfm.open(output_filename); base = &pfm; } diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index 93dd4534..7dcdd4bf 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -654,7 +654,7 @@ namespace ojph { if (little_endian) { - ui32 shift = 32 - bit_depth; + ui32 shift = 32 - bit_depth[comp_num]; const float* sp = temp_buf + comp_num; float* dp = line->f32; if (shift) @@ -669,7 +669,7 @@ namespace ojph { *dp++ = *sp; } else { - ui32 shift = 32 - bit_depth; + ui32 shift = 32 - bit_depth[comp_num]; const float* sp = temp_buf + comp_num; float* dp = line->f32; if (shift) @@ -718,7 +718,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// void pfm_out::configure(ui32 width, ui32 height, ui32 num_components, - float scale, ui32 bit_depth) + float scale, ui32* bit_depth) { assert(fh == NULL); //configure before opening if (num_components != 1 && num_components != 3) @@ -728,7 +728,8 @@ namespace ojph { this->height = height; 
this->num_components = num_components; this->scale = scale < 0.0f ? scale : -scale; - this->bit_depth = bit_depth; + for (ui32 c = 0; c < num_components; ++c) + this->bit_depth[c] = bit_depth[c]; } //////////////////////////////////////////////////////////////////////////// @@ -736,7 +737,7 @@ namespace ojph { { assert(fh); - ui32 shift = 32 - bit_depth; + ui32 shift = 32 - bit_depth[comp_num]; float* dp = buffer + comp_num; const float* sp = line->f32; diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index 0ef3fd2b..dd4692c0 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -1248,10 +1248,13 @@ namespace ojph { if (is_any_enabled() == false) return; - bool all_same_bit_depth = true; - bool all_same_signedness = true; + bool all_same = true; ui32 num_comps = siz.get_num_components(); + // first stage; find out if all components captured by the default + // entry (ALL_COMPS) has the same bit_depth/signedness, + // while doing this, set the BDnlt for components not captured but the + // default entry (ALL_COMPS) ui32 bit_depth = 0; // unknown yet bool is_signed = false; // unknown yet for (ui32 c = 0; c < num_comps; ++c) @@ -1266,10 +1269,8 @@ namespace ojph { } else { // we have seen an undefined component previously - all_same_bit_depth = - all_same_bit_depth && (bit_depth == siz.get_bit_depth(c)); - all_same_signedness = - all_same_signedness && (is_signed == siz.is_signed(c)); + all_same = all_same && (bit_depth == siz.get_bit_depth(c)); + all_same = all_same && (is_signed == siz.is_signed(c)); } } else @@ -1279,26 +1280,41 @@ namespace ojph { } } + // If the default entry is enabled/used, then if the components captured + // by it are not the same, we need to create entries for these + // components if (this->enabled) { if (bit_depth != 0) // default captures some components { + // captures at least one of the componets in the default entry this->BDnlt = (ui8)((bit_depth - 1) | 
(is_signed ? 0x80 : (ui8)0)); - if (!all_same_bit_depth || !all_same_signedness) + + if (!all_same) { - // We cannot use the default for all undefined components, so we - // will keep it and set it to the values of the first undefined - // component, but we will also define that component + // We cannot use the default for all components in it, so we + // will keep the first one, and we will also define other + // components on their own. for (ui32 c = 0; c < num_comps; ++c) { - param_nlt* p = get_comp_object(c); - if (p == NULL) { - // values were defined previously for (p && enabled) - p = add_object(c); - p->enabled = true; - p->BDnlt = (ui8)(siz.get_bit_depth(c) - 1); - p->BDnlt = (ui8)(p->BDnlt | (siz.is_signed(c) ? 0x80 : 0)); + ui16 bd = siz.get_bit_depth(c); + bool is = siz.is_signed(c); + if (bd != bit_depth || is != is_signed) + { + // this component has different bit_depth/signedness than the + // default (ALL_COMPS) entry + param_nlt* p = get_comp_object(c); + if (p == NULL || !p->enabled) + { + // this component is captured by the default (ALL_COMPS) + // entry (because it is either not in the list, or + // not enabled + if (p == NULL) + p = add_object(c); + p->enabled = true; + p->BDnlt = (ui8)((bd - 1) | (is ? 
0x80 : 0)); + } } } } @@ -1354,12 +1370,12 @@ namespace ojph { *(ui16*)buf = JP2K_MARKER::NLT; *(ui16*)buf = swap_byte(*(ui16*)buf); result &= file->write(&buf, 2) == 2; - *(ui16*)buf = swap_byte(Lnlt); + *(ui16*)buf = swap_byte(p->Lnlt); result &= file->write(&buf, 2) == 2; - *(ui16*)buf = swap_byte(Cnlt); + *(ui16*)buf = swap_byte(p->Cnlt); result &= file->write(&buf, 2) == 2; - result &= file->write(&BDnlt, 1) == 1; - result &= file->write(&Tnlt, 1) == 1; + result &= file->write(&p->BDnlt, 1) == 1; + result &= file->write(&p->Tnlt, 1) == 1; } p = p->next; } @@ -1423,9 +1439,9 @@ namespace ojph { p = p->next; } p->next = new param_nlt; + p->alloced_next = true; p = p->next; p->Cnlt = (ui16)comp_num; - p->alloced_next = true; return p; } @@ -1444,8 +1460,8 @@ namespace ojph { { param_nlt* p = this->next; while (p) { - if (p->enabled == true && p->Cnlt >= num_comps) - p->enabled = false; + if (p->enabled == true && p->Cnlt >= num_comps) + p->enabled = false; p = p->next; } } diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 225ad996..8415e079 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -695,6 +695,7 @@ namespace ojph { ~param_nlt() { if (next && alloced_next) { delete next; + alloced_next = false; next = NULL; } } From 27c9a2e6ef637b58c47d5cc85f0e0ad02c053690 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 20 Sep 2024 13:56:00 +1000 Subject: [PATCH 19/78] This fixes the one component case. 
--- src/apps/ojph_compress/ojph_compress.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp index 0ba6a9ef..753eeaf0 100644 --- a/src/apps/ojph_compress/ojph_compress.cpp +++ b/src/apps/ojph_compress/ojph_compress.cpp @@ -788,10 +788,20 @@ int main(int argc, char * argv[]) { if (num_precincts != -1) cod.set_precinct_size(num_precincts, precinct_size); cod.set_progression_order(prog_order); - if (employ_color_transform == -1) - cod.set_color_transform(true); + if (num_comps == 1) + { + if (employ_color_transform != -1) + OJPH_WARN(0x01000016, + "-colour_trans option is not needed and was not used; " + "this is because the image has one component only\n"); + } else - cod.set_color_transform(employ_color_transform == 1); + { + if (employ_color_transform == -1) + cod.set_color_transform(true); + else + cod.set_color_transform(employ_color_transform == 1); + } cod.set_reversible(reversible); if (!reversible && quantization_step != -1.0f) codestream.access_qcd().set_irrev_quant(quantization_step); From 0c6dfd3054ee211cb5ed81626f2a6d1ee41fb232 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 20 Sep 2024 15:31:25 +1000 Subject: [PATCH 20/78] Removes compilation warnings. 
--- src/apps/common/ojph_img_io.h | 4 ++-- src/core/codestream/ojph_params.cpp | 2 +- src/core/codestream/ojph_params_local.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h index 5f6488f7..27ad9f77 100644 --- a/src/apps/common/ojph_img_io.h +++ b/src/apps/common/ojph_img_io.h @@ -498,7 +498,7 @@ namespace ojph { const char *fname; mem_fixed_allocator *alloc_p; float *temp_buf; - ui32 temp_buf_byte_size; + size_t temp_buf_byte_size; ui32 bit_depth[3]; // this truncates data to bit_depth in the LSB float scale; bool little_endian; @@ -621,7 +621,7 @@ namespace ojph { ui32 width, height, num_components; ui32 bit_depth, bytes_per_sample; ui8* buffer; - ui32 buffer_size; + size_t buffer_size; ui32 cur_line, samples_per_line, bytes_per_line; conversion_fun converter; const line_buf *lptr[3]; diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index dd4692c0..2bd3987f 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -1298,7 +1298,7 @@ namespace ojph { for (ui32 c = 0; c < num_comps; ++c) { - ui16 bd = siz.get_bit_depth(c); + ui32 bd = siz.get_bit_depth(c); bool is = siz.is_signed(c); if (bd != bit_depth || is != is_signed) { diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 8415e079..ac8bb776 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -284,7 +284,7 @@ namespace ojph { void set_Rsiz_flag(ui16 flag) { Rsiz |= flag; } void reset_Rsiz_flag(ui16 flag) - { Rsiz &= ~flag; } + { Rsiz = (ui16)(Rsiz & ~flag); } private: ui16 Lsiz; From 9f8011ce790b1d014ebcd7e0a6842730bf14e739 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 20 Sep 2024 20:19:18 +1000 Subject: [PATCH 21/78] This fixes a bug in the block decoder code. The bug can happen when the data is around 16bits/sample or more, coded losslessly.
--- src/core/codestream/ojph_codeblock.cpp | 4 ++-- src/core/coding/ojph_block_decoder.cpp | 14 +++++++------- src/core/coding/ojph_block_decoder_avx2.cpp | 14 +++++++------- src/core/coding/ojph_block_decoder_ssse3.cpp | 14 +++++++------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp index 9a63ca19..09159513 100644 --- a/src/core/codestream/ojph_codeblock.cpp +++ b/src/core/codestream/ojph_codeblock.cpp @@ -152,11 +152,11 @@ namespace ojph { if (result == false) { if (resilient == true) { - OJPH_INFO(0x000300A1, "Error decoding a codeblock"); + OJPH_INFO(0x000300A1, "Error decoding a codeblock."); zero_block = true; } else - OJPH_ERROR(0x000300A1, "Error decoding a codeblock"); + OJPH_ERROR(0x000300A1, "Error decoding a codeblock."); } } else diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder.cpp index 5be5430a..259371b8 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder.cpp @@ -753,14 +753,14 @@ namespace ojph { { OJPH_WARN(0x00010001, "A malformed codeblock that has more than " "one coding pass, but zero length for " - "2nd and potential 3rd pass"); + "2nd and potential 3rd pass."); num_passes = 1; } if (num_passes > 3) { OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " - "This codeblocks has %d passes", + "This codeblocks has %d passes.", num_passes); return false; } @@ -772,7 +772,7 @@ namespace ojph { insufficient_precision = true; OJPH_WARN(0x00010003, "32 bits are not enough to decode this " "codeblock. This message will not be " - "displayed again"); + "displayed again."); } return false; } @@ -783,7 +783,7 @@ namespace ojph { OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " "pass. The code can be modified to support " "this case. 
This message will not be " - "displayed again"); + "displayed again."); } return false; // 32 bits are not enough to decode this } @@ -796,7 +796,7 @@ namespace ojph { OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " "nor MagRef passes; both will be skipped. " "This message will not be displayed " - "again"); + "again."); } } } @@ -806,7 +806,7 @@ namespace ojph { if (lengths1 < 2) { - OJPH_WARN(0x00010006, "Wrong codeblock length"); + OJPH_WARN(0x00010006, "Wrong codeblock length."); return false; } @@ -1079,7 +1079,7 @@ namespace ojph { // quad 0 length len = uvlc_entry & 0x7; // quad 0 suffix length uvlc_entry >>= 3; - ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); sp[1] = u_q; u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q sp[3] = u_q; diff --git a/src/core/coding/ojph_block_decoder_avx2.cpp b/src/core/coding/ojph_block_decoder_avx2.cpp index e7270a75..156ba1af 100644 --- a/src/core/coding/ojph_block_decoder_avx2.cpp +++ b/src/core/coding/ojph_block_decoder_avx2.cpp @@ -1077,14 +1077,14 @@ namespace ojph { { OJPH_WARN(0x00010001, "A malformed codeblock that has more than " "one coding pass, but zero length for " - "2nd and potential 3rd pass"); + "2nd and potential 3rd pass."); num_passes = 1; } if (num_passes > 3) { OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " - "This codeblocks has %d passes", + "This codeblocks has %d passes.", num_passes); return false; } @@ -1096,7 +1096,7 @@ namespace ojph { insufficient_precision = true; OJPH_WARN(0x00010003, "32 bits are not enough to decode this " "codeblock. This message will not be " - "displayed again"); + "displayed again."); } return false; } @@ -1107,7 +1107,7 @@ namespace ojph { OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " "pass. The code can be modified to support " "this case. 
This message will not be " - "displayed again"); + "displayed again."); } return false; // 32 bits are not enough to decode this } @@ -1120,7 +1120,7 @@ namespace ojph { OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " "nor MagRef passes; both will be skipped. " "This message will not be displayed " - "again"); + "again."); } } } @@ -1130,7 +1130,7 @@ namespace ojph { if (lengths1 < 2) { - OJPH_WARN(0x00010006, "Wrong codeblock length"); + OJPH_WARN(0x00010006, "Wrong codeblock length."); return false; } @@ -1407,7 +1407,7 @@ namespace ojph { // quad 0 length len = uvlc_entry & 0x7; // quad 0 suffix length uvlc_entry >>= 3; - ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); sp[1] = u_q; u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q sp[3] = u_q; diff --git a/src/core/coding/ojph_block_decoder_ssse3.cpp b/src/core/coding/ojph_block_decoder_ssse3.cpp index 99ae38cb..9fa58006 100644 --- a/src/core/coding/ojph_block_decoder_ssse3.cpp +++ b/src/core/coding/ojph_block_decoder_ssse3.cpp @@ -1033,14 +1033,14 @@ namespace ojph { { OJPH_WARN(0x00010001, "A malformed codeblock that has more than " "one coding pass, but zero length for " - "2nd and potential 3rd pass"); + "2nd and potential 3rd pass."); num_passes = 1; } if (num_passes > 3) { OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " - "This codeblocks has %d passes", + "This codeblocks has %d passes.", num_passes); return false; } @@ -1052,7 +1052,7 @@ namespace ojph { insufficient_precision = true; OJPH_WARN(0x00010003, "32 bits are not enough to decode this " "codeblock. This message will not be " - "displayed again"); + "displayed again."); } return false; } @@ -1063,7 +1063,7 @@ namespace ojph { OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " "pass. The code can be modified to support " "this case. 
This message will not be " - "displayed again"); + "displayed again."); } return false; // 32 bits are not enough to decode this } @@ -1076,7 +1076,7 @@ namespace ojph { OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " "nor MagRef passes; both will be skipped. " "This message will not be displayed " - "again"); + "again."); } } } @@ -1086,7 +1086,7 @@ namespace ojph { if (lengths1 < 2) { - OJPH_WARN(0x00010006, "Wrong codeblock length"); + OJPH_WARN(0x00010006, "Wrong codeblock length."); return false; } @@ -1361,7 +1361,7 @@ namespace ojph { // quad 0 length len = uvlc_entry & 0x7; // quad 0 suffix length uvlc_entry >>= 3; - ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); sp[1] = u_q; u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q sp[3] = u_q; From 7b0f5c89c0d3274c2370247541f62244e337be5d Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 20 Sep 2024 22:06:08 +1000 Subject: [PATCH 22/78] Improvement for pfm_in, and a bug fix for handling signed pfm data. 
--- src/apps/others/ojph_img_io.cpp | 7 ++++--- src/core/transform/ojph_colour.cpp | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index 7dcdd4bf..d3211d71 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -660,7 +660,7 @@ namespace ojph { if (shift) for (ui32 i = width; i > 0; --i, sp += num_comps) { - ui32 v = *(ui32*)sp; + si32 v = *(si32*)sp; v >>= shift; *dp++ = *(float*)&v; } @@ -675,8 +675,9 @@ namespace ojph { if (shift) for (ui32 i = width; i > 0; --i, sp += num_comps) { ui32 v = be2le(*(ui32*)sp); - v >>= shift; - *dp++ = *(float*)&v; + si32 u = *(si32*)&v; + u >>= shift; + *dp++ = *(float*)&u; } else for (ui32 i = width; i > 0; --i, sp += num_comps) { diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 34161d43..ca96d2d1 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -214,7 +214,7 @@ namespace ojph { { for (ui32 i = width; i > 0; --i) { const si32 v = *sp++; - *dp++ = v > 0 ? v : (- v - shift); + *dp++ = v >= 0 ? v : (- v - shift); } } From b6ddad9ff89ab93161b9ea1cfccc8d2503699184 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 20 Sep 2024 22:37:27 +1000 Subject: [PATCH 23/78] This should address "strict-aliasing". Cosmetic change to a wasm file. Also addresses CodeQL. --- src/apps/others/ojph_img_io.cpp | 80 ++++++++++++++----------- src/core/transform/ojph_colour_wasm.cpp | 4 +- 2 files changed, 47 insertions(+), 37 deletions(-) diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index d3211d71..572dd5ec 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -593,11 +593,11 @@ namespace ojph { start_of_data = ojph_ftell(fh); // alloc. linebuffer to hold a line of image data, if more than 1 comp. 
- if (temp_buf_byte_size < num_comps * width * sizeof(float)) + if (temp_buf_byte_size < num_comps * (size_t)width * sizeof(float)) { if (alloc_p == NULL) { - temp_buf_byte_size = num_comps * width * sizeof(float); + temp_buf_byte_size = num_comps * (size_t)width * sizeof(float); void* t = temp_buf; if (temp_buf) temp_buf = (float*)realloc(temp_buf, temp_buf_byte_size); @@ -611,7 +611,7 @@ namespace ojph { else { assert(temp_buf_byte_size == 0); //cannot reallocate the buffer - temp_buf_byte_size = num_comps * width * sizeof(float); + temp_buf_byte_size = num_comps * (size_t)width * sizeof(float); alloc_p->pre_alloc_data(temp_buf_byte_size, 0); } } @@ -623,7 +623,7 @@ namespace ojph { { if (alloc_p == NULL) return; - temp_buf = alloc_p->post_alloc_data(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data(num_comps * (size_t)width, 0); } ///////////////////////////////////////////////////////////////////////////// @@ -636,13 +636,14 @@ namespace ojph { if (comp_num == 0) { si64 loc = start_of_data; - loc += (height - 1 - cur_line) * num_comps * width * sizeof(float); + loc += (size_t)(height-1 - cur_line) * num_comps * width * sizeof(float); if (ojph_fseek(fh, loc, SEEK_SET) != 0) { close(); OJPH_ERROR(0x03000061, "Error seeking in file %s", fname); } - size_t result = fread(temp_buf, sizeof(float), num_comps * width, fh); + size_t result = + fread(temp_buf, sizeof(float), num_comps * (size_t)width, fh); if (result != num_comps * width) { close(); @@ -652,38 +653,42 @@ namespace ojph { cur_line = 0; } + union { + si32* s; + ui32* u; + float* f; + } sp, dp; + if (little_endian) { ui32 shift = 32 - bit_depth[comp_num]; - const float* sp = temp_buf + comp_num; - float* dp = line->f32; + sp.f = temp_buf + comp_num; + dp.f = line->f32; if (shift) - for (ui32 i = width; i > 0; --i, sp += num_comps) + for (ui32 i = width; i > 0; --i, sp.f += num_comps) { - si32 v = *(si32*)sp; - v >>= shift; - *dp++ = *(float*)&v; + si32 s = *sp.s; + s >>= shift; + *dp.s++ = s; } 
else - for (ui32 i = width; i > 0; --i, sp += num_comps) - *dp++ = *sp; + for (ui32 i = width; i > 0; --i, sp.f += num_comps) + *dp.f++ = *sp.f; } else { ui32 shift = 32 - bit_depth[comp_num]; - const float* sp = temp_buf + comp_num; - float* dp = line->f32; + sp.f = temp_buf + comp_num; + dp.f = line->f32; if (shift) - for (ui32 i = width; i > 0; --i, sp += num_comps) { - ui32 v = be2le(*(ui32*)sp); - si32 u = *(si32*)&v; - u >>= shift; - *dp++ = *(float*)&u; + for (ui32 i = width; i > 0; --i, sp.f += num_comps) { + ui32 u = be2le(*sp.u); + si32 s = *(si32*)&u; + s >>= shift; + *dp.s++ = s; } else - for (ui32 i = width; i > 0; --i, sp += num_comps) { - ui32 v = be2le(*(ui32*)sp); - *dp++ = *(float*)&v; - } + for (ui32 i = width; i > 0; --i, sp.f += num_comps) + *dp.u++ = be2le(*sp.u); } return width; @@ -710,7 +715,7 @@ namespace ojph { num_components > 1 ? 'F' : 'f', width, height, scale); if (result == 0) OJPH_ERROR(0x03000072, "error writing to file %s", filename); - buffer_size = width * num_components * sizeof(float); + buffer_size = (size_t)width * num_components * sizeof(float); buffer = (float*)malloc(buffer_size); fname = filename; cur_line = 0; @@ -739,23 +744,28 @@ namespace ojph { assert(fh); ui32 shift = 32 - bit_depth[comp_num]; - float* dp = buffer + comp_num; - const float* sp = line->f32; + union { + ui32* u; + float* f; + } sp, dp; + + dp.f = buffer + comp_num; + sp.f = line->f32; if (shift) - for (ui32 i = width; i > 0; --i, dp += num_components, ++sp) + for (ui32 i = width; i > 0; --i, dp.f += num_components, ++sp.f) { - ui32 v = *(ui32*)sp; - v <<= shift; - *dp = *(float*)&v; + ui32 u = *sp.u; + u <<= shift; + *dp.u = u; } else - for (ui32 i = width; i > 0; --i, dp += num_components) - *dp = *sp++; + for (ui32 i = width; i > 0; --i, dp.f += num_components) + *dp.f = *sp.f++; if (comp_num == num_components - 1) { - size_t samples_per_line = num_components * width; + size_t samples_per_line = num_components * (size_t)width; si64 loc = 
start_of_data; loc += (height - 1 - cur_line)* samples_per_line * sizeof(float); if (ojph_fseek(fh, loc, SEEK_SET) != 0) diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index e174d0b1..57b84c7e 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -133,8 +133,8 @@ namespace ojph { v128_t s = wasm_v128_load(sp); v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value - v_m_sh = wasm_v128_and(v_m_sh, c); // keep only - shift - value - s = wasm_v128_andnot(s, c); // keep only +ve or 0 + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + s = wasm_v128_andnot(c, s); // keep only +ve or 0 s = wasm_v128_or(s, v_m_sh); // combine wasm_v128_store(dp, s); } From 1580ae79669fe86aa88c26ed42e775a2ecbd6aa8 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 20 Sep 2024 22:41:48 +1000 Subject: [PATCH 24/78] This address one more warning. 
--- src/apps/common/ojph_img_io.h | 2 +- src/apps/others/ojph_img_io.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h index 27ad9f77..401ad658 100644 --- a/src/apps/common/ojph_img_io.h +++ b/src/apps/common/ojph_img_io.h @@ -805,7 +805,7 @@ namespace ojph { FILE *fh; const char *fname; float* buffer; - ui32 buffer_size; + size_t buffer_size; ui32 width, height, num_components; float scale; ui32 bit_depth[3]; diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index 572dd5ec..a7a91b7c 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -643,7 +643,7 @@ namespace ojph { OJPH_ERROR(0x03000061, "Error seeking in file %s", fname); } size_t result = - fread(temp_buf, sizeof(float), num_comps * (size_t)width, fh); + fread(temp_buf, sizeof(float), (size_t)num_comps * (size_t)width, fh); if (result != num_comps * width) { close(); From 9e97f99d3e97afb62b6f84f334e9a5a9134ac8f4 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 20 Sep 2024 22:48:46 +1000 Subject: [PATCH 25/78] Hopefully, this will remove warnings. 
--- src/apps/others/ojph_img_io.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index a7a91b7c..d8120251 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -636,7 +636,8 @@ namespace ojph { if (comp_num == 0) { si64 loc = start_of_data; - loc += (size_t)(height-1 - cur_line) * num_comps * width * sizeof(float); + loc += (size_t)(height-1 - cur_line) * (size_t)num_comps + * (size_t)width * sizeof(float); if (ojph_fseek(fh, loc, SEEK_SET) != 0) { close(); @@ -644,7 +645,7 @@ namespace ojph { } size_t result = fread(temp_buf, sizeof(float), (size_t)num_comps * (size_t)width, fh); - if (result != num_comps * width) + if (result != (size_t)num_comps * (size_t)width) { close(); OJPH_ERROR(0x03000062, "Not enough data in file %s", fname); From bcc4d3bd394fd97b50b82677b96d3aa346f50647 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 20 Sep 2024 23:24:36 +1000 Subject: [PATCH 26/78] Added usage text for ojph_compress. Also , the end user must now specify -bit_depth for .pfm files -- this temporary. 
--- src/apps/ojph_compress/ojph_compress.cpp | 41 +++++++++++++++++++----- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp index 753eeaf0..90c8a059 100644 --- a/src/apps/ojph_compress/ojph_compress.cpp +++ b/src/apps/ojph_compress/ojph_compress.cpp @@ -526,9 +526,9 @@ int main(int argc, char * argv[]) { std::cout << "\nThe following arguments are necessary:\n" #ifdef OJPH_ENABLE_TIFF_SUPPORT - " -i input file name (either pgm, ppm, tif(f), or raw(yuv))\n" + " -i input file name (either pgm, ppm, pfm, tif(f), or raw(yuv))\n" #else - " -i input file name (either pgm, ppm, or raw(yuv))\n" + " -i input file name (either pgm, ppm, pfm, or raw(yuv))\n" #endif // !OJPH_ENABLE_TIFF_SUPPORT " -o output file name\n\n" @@ -587,7 +587,28 @@ int main(int argc, char * argv[]) { " component; for example: 12,10,10\n" " -downsamp {x,y},{x,y},...,{x,y} a list of x,y points, one for each\n" " component; for example {1,1},{2,2},{2,2}\n\n" - ; + "\n" + + ".pfm files receive special treatment. Currently, lossy compression\n" + "with these files is not supported, only lossless. When these files are\n" + "used, the NLT segment marker is automatically inserted into the\n" + "codestream. For these files the following arguments can be useful\n" + " -signed a comma - separated list of true or false parameters, one\n" + " for each component; for example: true,false,false.\n" + " The sign only affects how values are treated; for negative\n" + " values the standard requires a special non-linear\n" + " transformation. 
When signed is false, no transformation\n" + " is employed, as we assume all values are 0 or positive.\n" + " When signed is true, the aforementioned transformation is\n" + " employed on negative values only.\n" + " -bit_depth a comma-separated list of bit depth values, one per \n" + " component; for example: 12,10,10.\n" + " Floating value numbers are treated as integers, and they\n" + " are shifted to the right, keeping only the specificed\n" + " number of bits. Note that a bit depth of 28 upwards is not\n" + " supported.\n" + + "\n"; return -1; } if (!get_arguments(argc, argv, input_filename, output_filename, @@ -747,6 +768,10 @@ int main(int argc, char * argv[]) { assert(num_comps == 1 || num_comps == 3); siz.set_num_components(num_comps); + if (bit_depth[0] == 0) + OJPH_ERROR(0x01000091, + "-bit_depth must be specified (this is temporary only).\n"); + if (bit_depth[0] != 0) // one was set if (num_bit_depths < num_comps) // but if not enough, repeat for (ojph::ui32 c = num_bit_depths; c < num_comps; ++c) @@ -791,7 +816,7 @@ int main(int argc, char * argv[]) { if (num_comps == 1) { if (employ_color_transform != -1) - OJPH_WARN(0x01000016, + OJPH_WARN(0x01000092, "-colour_trans option is not needed and was not used; " "this is because the image has one component only\n"); } @@ -815,7 +840,7 @@ int main(int argc, char * argv[]) { nlt.set_type3_transformation(c, true); } else - OJPH_ERROR(0x01000091, "The support for pfm image is not " + OJPH_ERROR(0x01000093, "The support for pfm image is not " "complete; I need to figure how to modify the interface " "to better support the exchange of floating point data. 
" "Feeding float point data is not supported yet, unless it " @@ -829,13 +854,13 @@ int main(int argc, char * argv[]) { codestream.request_tlm_marker(tlm_marker); if (dims.w != 0 || dims.h != 0) - OJPH_WARN(0x01000092, + OJPH_WARN(0x01000094, "-dims option is not needed and was not used\n"); if (num_components != 0) - OJPH_WARN(0x01000093, + OJPH_WARN(0x01000095, "-num_comps is not needed and was not used\n"); if (comp_downsampling[0].x != 0 || comp_downsampling[0].y != 0) - OJPH_WARN(0x01000094, + OJPH_WARN(0x01000096, "-downsamp is not needed and was not used\n"); base = &pfm; From 11307b641838a551e0e9c76ed067b49ed2226c71 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 22 Sep 2024 13:45:58 +1000 Subject: [PATCH 27/78] Change default signedness of .pfm to true. --- src/apps/ojph_compress/ojph_compress.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp index 90c8a059..d87e38ba 100644 --- a/src/apps/ojph_compress/ojph_compress.cpp +++ b/src/apps/ojph_compress/ojph_compress.cpp @@ -798,7 +798,7 @@ int main(int argc, char * argv[]) { ojph::ui32 bd = 32; if (bit_depth[c] != 0) bd = bit_depth[c]; - bool is = false; + bool is = true; if (is_signed[c] != -1) is = is_signed[c] != 0; siz.set_component(c, ds, bd, is); From 57c7af3b757f2ca6ba7d4dca527fb33f1d11a7d5 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 22 Sep 2024 14:13:18 +1000 Subject: [PATCH 28/78] This adds some codestream checking in ojph_expand. 
--- src/apps/ojph_expand/ojph_expand.cpp | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/apps/ojph_expand/ojph_expand.cpp b/src/apps/ojph_expand/ojph_expand.cpp index c3940389..3d3b981a 100644 --- a/src/apps/ojph_expand/ojph_expand.cpp +++ b/src/apps/ojph_expand/ojph_expand.cpp @@ -271,6 +271,8 @@ int main(int argc, char *argv[]) { { codestream.set_planar(false); ojph::param_siz siz = codestream.access_siz(); + ojph::param_cod cod = codestream.access_cod(); + ojph::param_nlt nlt = codestream.access_nlt(); ojph::ui32 num_comps = siz.get_num_components(); if (num_comps != 3 && num_comps != 1) @@ -289,8 +291,30 @@ int main(int argc, char *argv[]) { "To save an image to ppm, all the components must have the " "same downsampling ratio\n"); ojph::ui32 bit_depth[3]; - for (ojph::ui32 c = 0; c < siz.get_num_components(); ++c) - bit_depth[c] = siz.get_bit_depth(c); + for (ojph::ui32 c = 0; c < siz.get_num_components(); ++c) { + ojph::ui8 bd = 0; + bool is = true; + bool result = nlt.get_type3_transformation(c, bd, is); + if (result == false) + OJPH_ERROR(0x0200000E, + "This codestream is not supported; it does not have an " + "NLT segment marker for this component (or no default NLT " + "settings) .\n"); + if (bd != siz.get_bit_depth(c) || is != siz.is_signed(c)) + OJPH_ERROR(0x0200000F, + "There is discrepancy in component %d configuration between " + "SIZ marker segment, which specifies bit_depth = %d and " + "signedness = %s, and NLT marker segment, which specifies " + "bit_depth = %d and signedness = %s.\n", c, + siz.get_bit_depth(c), is != siz.is_signed(c) ? "True" : "False", + bd, is ? "True" : "False"); + bit_depth[c] = bd; + } + if (!cod.is_reversible()) + OJPH_ERROR(0x02000010, + "This codestream is lossy (not reversible), and we currently " + "only support reversible codestreams for .pfm target files. 
" + "This is only temporary and will be changed at some point.\n"); pfm.configure(siz.get_recon_width(0), siz.get_recon_height(0), siz.get_num_components(), -1.0f, bit_depth); pfm.open(output_filename); From 796fb6bd3cfeed1276f814b0d59df04a229830b7 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 22 Sep 2024 14:42:03 +1000 Subject: [PATCH 29/78] Spelling mistake fix. --- src/apps/ojph_compress/ojph_compress.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp index d87e38ba..e7c047a4 100644 --- a/src/apps/ojph_compress/ojph_compress.cpp +++ b/src/apps/ojph_compress/ojph_compress.cpp @@ -604,7 +604,7 @@ int main(int argc, char * argv[]) { " -bit_depth a comma-separated list of bit depth values, one per \n" " component; for example: 12,10,10.\n" " Floating value numbers are treated as integers, and they\n" - " are shifted to the right, keeping only the specificed\n" + " are shifted to the right, keeping only the specified\n" " number of bits. 
Note that a bit depth of 28 upwards is not\n" " supported.\n" From 77aae494a6a221d1af403a4cdea8b458c6c1caa1 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 22 Sep 2024 15:47:49 +1000 Subject: [PATCH 30/78] README.md Update --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 15d65972..bfce9832 100644 --- a/README.md +++ b/README.md @@ -17,4 +17,8 @@ The standard is available free of charge from [ITU website](https://www.itu.int/ * [Compiling and Running in Docker](./docs/docker.md) * [Usage Example](./docs/usage_examples.md) * [Web-based Demos](./docs/web_demos.md) -* [Doxygen Documentation Style](./docs/doxygen_style.md) \ No newline at end of file +* [Doxygen Documentation Style](./docs/doxygen_style.md) + +# Repositories # +[![Packaging status](https://repology.org/badge/vertical-allrepos/openjph.svg)](https://repology.org/project/openjph/versions) + From 00d233524b515a17f6ae8cf2533174eff494f6a7 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 12 Oct 2024 10:46:14 +1100 Subject: [PATCH 31/78] Updated link to GPU Encoding Paper in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bfce9832..90064a73 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Open source implementation of High-throughput JPEG2000 (HTJ2K), also known as JPH, JPEG2000 Part 15, ISO/IEC 15444-15, and ITU-T T.814. Here, we are interested in implementing the HTJ2K only, supporting features that are defined in JPEG2000 Part 1 (for example, for wavelet transform, only reversible 5/3 and irreversible 9/7 are supported). -The interested reader is referred to the [short HTJ2K white paper](http://ds.jpeg.org/whitepapers/jpeg-htj2k-whitepaper.pdf), or the [extended HTJ2K white paper](https://htj2k.com/wp-content/uploads/white-paper.pdf) for more details on HTJ2K. 
[This](https://kakadusoftware.com/wp-content/uploads/icip2019.pdf) paper explores the attainable performance on CPU, while [this](https://kakadusoftware.com/wp-content/uploads/ICIP2019_GPU.pdf) and [this](https://webapps.unsworks.library.unsw.edu.au/fapi/datastream/unsworks:75139/bin990339e4-8805-4456-ae30-223d85f9b1c1) explores performance on the GPU. +The interested reader is referred to the [short HTJ2K white paper](http://ds.jpeg.org/whitepapers/jpeg-htj2k-whitepaper.pdf), or the [extended HTJ2K white paper](https://htj2k.com/wp-content/uploads/white-paper.pdf) for more details on HTJ2K. [This](https://kakadusoftware.com/wp-content/uploads/icip2019.pdf) paper explores the attainable performance on CPU, while [this](https://kakadusoftware.com/wp-content/uploads/ICIP2019_GPU.pdf) and [this](http://hdl.handle.net/1959.4/unsworks_75139) explores performance on the GPU. # The standard # From 82ee92fce015757789c1d370dcc5234071d73d55 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Wed, 23 Oct 2024 08:26:22 +1100 Subject: [PATCH 32/78] Starting adding support for up to 32bit lossless precision encoding --- src/apps/common/ojph_img_io.h | 2 +- src/core/codestream/ojph_codeblock.h | 2 +- src/core/codestream/ojph_codestream_local.h | 2 +- src/core/codestream/ojph_resolution.h | 2 +- src/core/codestream/ojph_subband.h | 2 +- src/core/codestream/ojph_tile.h | 2 +- src/core/codestream/ojph_tile_comp.h | 2 +- src/core/common/ojph_codestream.h | 2 +- src/core/common/ojph_mem.h | 24 ++++++++++++++--- src/core/transform/ojph_colour.cpp | 29 ++++++++++++--------- src/core/transform/ojph_colour.h | 28 ++++++++++---------- src/core/transform/ojph_transform.cpp | 4 ++- src/core/transform/ojph_transform.h | 5 +++- src/core/transform/ojph_transform_local.h | 5 +++- 14 files changed, 70 insertions(+), 41 deletions(-) diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h index 401ad658..9f7f3356 100644 --- a/src/apps/common/ojph_img_io.h +++ 
b/src/apps/common/ojph_img_io.h @@ -54,7 +54,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // defined elsewhere class mem_fixed_allocator; - struct line_buf; + class line_buf; //////////////////////////////////////////////////////////////////////////// // diff --git a/src/core/codestream/ojph_codeblock.h b/src/core/codestream/ojph_codeblock.h index 2f7d8e78..fc42aee1 100644 --- a/src/core/codestream/ojph_codeblock.h +++ b/src/core/codestream/ojph_codeblock.h @@ -48,7 +48,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class mem_elastic_allocator; class codestream; struct coded_lists; diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h index e6930d5f..3d036582 100644 --- a/src/core/codestream/ojph_codestream_local.h +++ b/src/core/codestream/ojph_codestream_local.h @@ -46,7 +46,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class mem_fixed_allocator; class mem_elastic_allocator; class codestream; diff --git a/src/core/codestream/ojph_resolution.h b/src/core/codestream/ojph_resolution.h index 635a4ced..61564557 100644 --- a/src/core/codestream/ojph_resolution.h +++ b/src/core/codestream/ojph_resolution.h @@ -45,7 +45,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class mem_elastic_allocator; class codestream; diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h index 8cadae07..6d8af59f 100644 --- a/src/core/codestream/ojph_subband.h +++ b/src/core/codestream/ojph_subband.h @@ -45,7 +45,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct 
line_buf; + class line_buf; class mem_elastic_allocator; class codestream; diff --git a/src/core/codestream/ojph_tile.h b/src/core/codestream/ojph_tile.h index 4b542421..6b65a130 100644 --- a/src/core/codestream/ojph_tile.h +++ b/src/core/codestream/ojph_tile.h @@ -47,7 +47,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class codestream; namespace local { diff --git a/src/core/codestream/ojph_tile_comp.h b/src/core/codestream/ojph_tile_comp.h index def39e55..62b8fba2 100644 --- a/src/core/codestream/ojph_tile_comp.h +++ b/src/core/codestream/ojph_tile_comp.h @@ -48,7 +48,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class codestream; namespace local { diff --git a/src/core/common/ojph_codestream.h b/src/core/common/ojph_codestream.h index c28096ed..f7a80651 100644 --- a/src/core/common/ojph_codestream.h +++ b/src/core/common/ojph_codestream.h @@ -61,7 +61,7 @@ namespace ojph { class comment_exchange; class mem_fixed_allocator; struct point; - struct line_buf; + class line_buf; class outfile_base; class infile_base; diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h index d7497cdb..f58c8ada 100644 --- a/src/core/common/ojph_mem.h +++ b/src/core/common/ojph_mem.h @@ -132,9 +132,23 @@ namespace ojph { }; ///////////////////////////////////////////////////////////////////////////// - struct line_buf + class line_buf { - line_buf() : size(0), pre_size(0), i32(0) {} + public: + enum line_buf_type { + LFT_UNDEFINED = 0x00, // Type is undefined/uninitialized + // These flags reflects data size in bytes + LFT_BYTE = 0x01, // Set when data is 1 byte + LFT_SHORT = 0x02, // Set when data is 2 bytes + LFT_INTEGER = 0x04, // Set when data is 4 bytes + LFT_LONG = 0x08, // Set when data is 8 bytes + LFT_REVERSIBLE = 0x10, // Set when data is used 
for reversible coding + // Not all combinations are useful + LFT_SIZE_MASK = 0x0F, // To extract data size + }; + + public: + line_buf() : size(0), pre_size(0), flags(LFT_UNDEFINED), i32(0) {} template void pre_alloc(mem_fixed_allocator *p, size_t num_ele, ui32 pre_size) @@ -153,9 +167,11 @@ namespace ojph { size_t size; ui32 pre_size; + line_buf_type flags; union { - si32* i32; - float* f32; + si32* i32; // 32bit integer type, used for lossless compression + float* f32; // float type, used for lossy compression + void* p; // not type is associated with the pointer }; }; diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index ca96d2d1..6ef88caf 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -39,45 +39,50 @@ #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include "ojph_colour_local.h" namespace ojph { + + // defined elsewhere + class line_buf; + namespace local { ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_si32_shftd) - (const si32 *sp, si32 *dp, int shift, ui32 width) = NULL; + (const line_buf* src, line_buf* dst, int shift, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_si32_nlt_type3) - (const si32* sp, si32* dp, int shift, ui32 width) = NULL; + (const line_buf* src, line_buf* dst, int shift, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float_shftd) - (const si32 *sp, float *dp, float mul, ui32 width) = NULL; + (const line_buf* src, line_buf* dst, float mul, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float) - (const si32 *sp, float *dp, float mul, ui32 width) = NULL; + (const line_buf* src, line_buf* dst, float mul, ui32 width) = NULL; 
////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32_shftd) - (const float *sp, si32 *dp, float mul, ui32 width) = NULL; + (const line_buf* sp, line_buf* dp, float mul, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32) - (const float *sp, si32 *dp, float mul, ui32 width) = NULL; + (const line_buf* sp, line_buf* dp, float mul, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*rct_forward) - (const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) = NULL; + (const line_buf* r, const line_buf* g, const line_buf* b, + line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL; ////////////////////////////////////////////////////////////////////////// void (*rct_backward) - (const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) = NULL; + (const line_buf* r, const line_buf* g, const line_buf* b, + line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL; ////////////////////////////////////////////////////////////////////////// void (*ict_forward) @@ -86,8 +91,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void (*ict_backward) - (const float *y, const float *cb, const float *cr, - float *r, float *g, float *b, ui32 repeat) = NULL; + (const line_buf* y, const line_buf* cb, const line_buf* cr, + line_buf* r, line_buf* g, line_buf* b, ui32 repeat) = NULL; ////////////////////////////////////////////////////////////////////////// static bool colour_transform_functions_initialized = false; diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h index 52df3123..a03759e7 100644 --- a/src/core/transform/ojph_colour.h +++ b/src/core/transform/ojph_colour.h @@ -47,47 +47,47 @@ namespace ojph { 
//////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_si32_to_si32_shftd) - (const si32 *sp, si32 *dp, int shift, ui32 width); + (const line_buf* sp, line_buf* dp, int shift, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_si32_to_si32_nlt_type3) - (const si32 *sp, si32 *dp, int shift, ui32 width); + (const line_buf *sp, line_buf *dp, int shift, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_si32_to_float_shftd) - (const si32 *sp, float *dp, float mul, ui32 width); + (const line_buf *sp, line_buf *dp, float mul, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_si32_to_float) - (const si32 *sp, float *dp, float mul, ui32 width); + (const line_buf *sp, line_buf *dp, float mul, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_float_to_si32_shftd) - (const float *sp, si32 *dp, float mul, ui32 width); + (const line_buf *sp, line_buf *dp, float mul, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_float_to_si32) - (const float *sp, si32 *dp, float mul, ui32 width); + (const line_buf *sp, line_buf *dp, float mul, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*rct_forward) - (const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + (const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); //////////////////////////////////////////////////////////////////////////// extern void (*rct_backward) - (const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + (const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf 
*b, ui32 repeat); //////////////////////////////////////////////////////////////////////////// extern void (*ict_forward) - (const float *r, const float *g, const float *b, - float *y, float *cb, float *cr, ui32 repeat); + (const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); //////////////////////////////////////////////////////////////////////////// extern void (*ict_backward) - (const float *y, const float *cb, const float *cr, - float *r, float *g, float *b, ui32 repeat); + (const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); } } diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp index ee4bb084..09891541 100644 --- a/src/core/transform/ojph_transform.cpp +++ b/src/core/transform/ojph_transform.cpp @@ -45,7 +45,9 @@ #include "../codestream/ojph_params_local.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; namespace local { diff --git a/src/core/transform/ojph_transform.h b/src/core/transform/ojph_transform.h index 0e59632e..f7576a1c 100644 --- a/src/core/transform/ojph_transform.h +++ b/src/core/transform/ojph_transform.h @@ -42,7 +42,10 @@ #include "ojph_defs.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; + namespace local { union lifting_step; struct param_atk; diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h index ec2a2e12..c139ca00 100644 --- a/src/core/transform/ojph_transform_local.h +++ b/src/core/transform/ojph_transform_local.h @@ -42,7 +42,10 @@ #include "ojph_defs.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; + namespace local { struct param_atk; union lifting_step; From 7e5e240f3f77e1243c09757700755ea62e16c1f9 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 29 Oct 2024 17:48:10 +1100 Subject: [PATCH 33/78] The front end is complete 
for generic code (not SIMD). --- src/core/codestream/ojph_tile.cpp | 91 +++++----- src/core/common/ojph_mem.h | 6 +- src/core/transform/ojph_colour.cpp | 224 ++++++++++++++++++++----- src/core/transform/ojph_colour.h | 32 ++-- src/core/transform/ojph_colour_local.h | 22 ++- 5 files changed, 263 insertions(+), 112 deletions(-) diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index 281e1564..b1cb95a7 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -259,17 +259,15 @@ namespace ojph { line_buf *tc = comps[comp_num].get_line(); if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp = line->i32 + line_offsets[comp_num]; - si32* dp = tc->i32; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + si64 shift = 1LL << (num_bits[comp_num] - 1); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(line, line_offsets[comp_num], + tc, 0, shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 0 : -shift; + rev_convert(line, line_offsets[comp_num], tc, 0, + shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width); } else { @@ -285,26 +283,25 @@ namespace ojph { } else { - int shift = 1 << (num_bits[comp_num] - 1); + si64 shift = 1LL << (num_bits[comp_num] - 1); ui32 comp_width = comp_rects[comp_num].siz.w; if (reversible) { - const si32 *sp = line->i32 + line_offsets[comp_num]; - si32 *dp = lines[comp_num].i32; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(line, line_offsets[comp_num], + lines + comp_num, 0, shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 
0 : -shift; + rev_convert(line, line_offsets[comp_num], lines + comp_num, 0, + shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width); + if (comp_num == 2) { // reversible color transform - rct_forward(lines[0].i32, lines[1].i32, lines[2].i32, - comps[0].get_line()->i32, - comps[1].get_line()->i32, - comps[2].get_line()->i32, comp_width); + rct_forward(lines + 0, lines + 1, lines + 2, + comps[0].get_line(), + comps[1].get_line(), + comps[2].get_line(), comp_width); comps[0].push_line(); comps[1].push_line(); comps[2].push_line(); @@ -350,17 +347,15 @@ namespace ojph { ui32 comp_width = recon_comp_rects[comp_num].siz.w; if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp = src_line->i32; - si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + si64 shift = 1LL << (num_bits[comp_num] - 1); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(src_line, 0, tgt_line, + line_offsets[comp_num], shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 
0 : shift; + rev_convert(src_line, 0, tgt_line, + line_offsets[comp_num], shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); } else { @@ -380,9 +375,9 @@ namespace ojph { if (comp_num == 0) { if (reversible) - rct_backward(comps[0].pull_line()->i32, comps[1].pull_line()->i32, - comps[2].pull_line()->i32, lines[0].i32, lines[1].i32, - lines[2].i32, comp_width); + rct_backward(comps[0].pull_line(), comps[1].pull_line(), + comps[2].pull_line(), lines + 0, lines + 1, + lines + 2, comp_width); else ict_backward(comps[0].pull_line()->f32, comps[1].pull_line()->f32, comps[2].pull_line()->f32, lines[0].f32, lines[1].f32, @@ -390,21 +385,21 @@ namespace ojph { } if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp; + si64 shift = 1LL << (num_bits[comp_num] - 1); + line_buf* src_line; if (comp_num < 3) - sp = lines[comp_num].i32; + src_line = lines + comp_num; else - sp = comps[comp_num].pull_line()->i32; + src_line = comps[comp_num].pull_line(); si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(src_line, 0, tgt_line, + line_offsets[comp_num], shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 
0 : shift; + rev_convert(src_line, 0, tgt_line, + line_offsets[comp_num], shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); } else { diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h index f58c8ada..abd5a0f4 100644 --- a/src/core/common/ojph_mem.h +++ b/src/core/common/ojph_mem.h @@ -139,9 +139,9 @@ namespace ojph { LFT_UNDEFINED = 0x00, // Type is undefined/uninitialized // These flags reflects data size in bytes LFT_BYTE = 0x01, // Set when data is 1 byte - LFT_SHORT = 0x02, // Set when data is 2 bytes - LFT_INTEGER = 0x04, // Set when data is 4 bytes - LFT_LONG = 0x08, // Set when data is 8 bytes + LFT_16BIT = 0x02, // Set when data is 2 bytes + LFT_32BIT = 0x04, // Set when data is 4 bytes + LFT_64BIT = 0x08, // Set when data is 8 bytes LFT_REVERSIBLE = 0x10, // Set when data is used for reversible coding // Not all combinations are useful LFT_SIZE_MASK = 0x0F, // To extract data size diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 6ef88caf..154b66b7 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -51,28 +51,32 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_si32_to_si32_shftd) - (const line_buf* src, line_buf* dst, int shift, ui32 width) = NULL; + void (*rev_convert) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_si32_to_si32_nlt_type3) - (const line_buf* src, line_buf* dst, int shift, ui32 width) = NULL; + void (*rev_convert_nlt_type3) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void 
(*cnvrt_si32_to_float_shftd) - (const line_buf* src, line_buf* dst, float mul, ui32 width) = NULL; + (const si32 *sp, float *dp, float mul, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float) - (const line_buf* src, line_buf* dst, float mul, ui32 width) = NULL; + (const si32 *sp, float *dp, float mul, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32_shftd) - (const line_buf* sp, line_buf* dp, float mul, ui32 width) = NULL; + (const float *sp, si32 *dp, float mul, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32) - (const line_buf* sp, line_buf* dp, float mul, ui32 width) = NULL; + (const float *sp, si32 *dp, float mul, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*rct_forward) @@ -91,8 +95,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void (*ict_backward) - (const line_buf* y, const line_buf* cb, const line_buf* cr, - line_buf* r, line_buf* g, line_buf* b, ui32 repeat) = NULL; + (const float *y, const float *cb, const float *cr, + float *r, float *g, float *b, ui32 repeat) = NULL; ////////////////////////////////////////////////////////////////////////// static bool colour_transform_functions_initialized = false; @@ -105,8 +109,10 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) - cnvrt_si32_to_si32_shftd = gen_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = gen_cnvrt_si32_to_si32_nlt_type3; + // cnvrt_si32_to_si32_shftd = gen_cnvrt_si32_to_si32_shftd; + // cnvrt_si32_to_si32_nlt_type3 = gen_cnvrt_si32_to_si32_nlt_type3; + rev_convert = gen_rev_convert; + rev_convert_nlt_type3 = gen_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = gen_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = 
gen_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = gen_cnvrt_float_to_si32_shftd; @@ -137,10 +143,10 @@ namespace ojph { { cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd; cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32; - cnvrt_si32_to_si32_shftd = sse2_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = sse2_cnvrt_si32_to_si32_nlt_type3; - rct_forward = sse2_rct_forward; - rct_backward = sse2_rct_backward; + // cnvrt_si32_to_si32_shftd = sse2_cnvrt_si32_to_si32_shftd; + // cnvrt_si32_to_si32_nlt_type3 = sse2_cnvrt_si32_to_si32_nlt_type3; + // rct_forward = sse2_rct_forward; + // rct_backward = sse2_rct_backward; } #endif // !OJPH_DISABLE_SSE2 @@ -159,10 +165,10 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - cnvrt_si32_to_si32_shftd = avx2_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = avx2_cnvrt_si32_to_si32_nlt_type3; - rct_forward = avx2_rct_forward; - rct_backward = avx2_rct_backward; + // cnvrt_si32_to_si32_shftd = avx2_cnvrt_si32_to_si32_shftd; + // cnvrt_si32_to_si32_nlt_type3 = avx2_cnvrt_si32_to_si32_nlt_type3; + // rct_forward = avx2_rct_forward; + // rct_backward = avx2_rct_backward; } #endif // !OJPH_DISABLE_AVX2 @@ -206,20 +212,78 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void gen_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) { - for (ui32 i = width; i > 0; --i) - *dp++ = *sp++ + shift; + if (src_line->flags | line_buf::LFT_32BIT) + { + if (dst_line->flags | line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + si32 s = (si32)shift; + for (ui32 i = width; i > 0; --i) + *dp++ = *sp++ + s; + } + else 
+ { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = (si64*)dst_line->p + dst_line_offset; + for (ui32 i = width; i > 0; --i) + *dp++ = *sp++ + shift; + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = (si64*)src_line->p + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + for (ui32 i = width; i > 0; --i) + *dp++ = (si32)(*sp++ + shift); + } } ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width) + void gen_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) { - for (ui32 i = width; i > 0; --i) { - const si32 v = *sp++; - *dp++ = v >= 0 ? v : (- v - shift); + if (src_line->flags | line_buf::LFT_32BIT) + { + if (dst_line->flags | line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + si32 s = (si32)shift; + for (ui32 i = width; i > 0; --i) { + const si32 v = *sp++; + *dp++ = v >= 0 ? v : (- v - s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = (si64*)dst_line->p + dst_line_offset; + for (ui32 i = width; i > 0; --i) { + const si64 v = *sp++; + *dp++ = v >= 0 ? v : (- v - shift); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = (si64*)src_line->p + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + for (ui32 i = width; i > 0; --i) { + const si64 v = *sp++; + *dp++ = (si32)(v >= 0 ? 
v : (- v - shift)); + } } } @@ -256,26 +320,104 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void gen_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { - for (ui32 i = repeat; i > 0; --i) + assert((y->flags | line_buf::LFT_REVERSIBLE) && + (cb->flags | line_buf::LFT_REVERSIBLE) && + (cr->flags | line_buf::LFT_REVERSIBLE) && + (r->flags | line_buf::LFT_REVERSIBLE) && + (g->flags | line_buf::LFT_REVERSIBLE) && + (b->flags | line_buf::LFT_REVERSIBLE)); + + if (y->flags | line_buf::LFT_32BIT) + { + assert((y->flags | line_buf::LFT_32BIT) && + (cb->flags | line_buf::LFT_32BIT) && + (cr->flags | line_buf::LFT_32BIT) && + (r->flags | line_buf::LFT_32BIT) && + (g->flags | line_buf::LFT_32BIT) && + (b->flags | line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (ui32 i = repeat; i > 0; --i) + { + si32 rr = *rp++, gg = *gp++, bb = *bp++; + *yp++ = (rr + (gg << 1) + bb) >> 2; + *cbp++ = (bb - gg); + *crp++ = (rr - gg); + } + } + else { - *y++ = (*r + (*g << 1) + *b) >> 2; - *cb++ = (*b++ - *g); - *cr++ = (*r++ - *g++); + assert((y->flags | line_buf::LFT_64BIT) && + (cb->flags | line_buf::LFT_64BIT) && + (cr->flags | line_buf::LFT_64BIT) && + (r->flags | line_buf::LFT_32BIT) && + (g->flags | line_buf::LFT_32BIT) && + (b->flags | line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = (si64*)y->p, *cbp = (si64*)cb->p, *crp = (si64*)cr->p; + for (ui32 i = repeat; i > 0; --i) + { + si64 rr = *rp++, gg = *gp++, bb = *bp++; + *yp++ = (rr + (gg << 1) + bb) >> 2; + *cbp++ = (bb - gg); + *crp++ = (rr - gg); + } } } ////////////////////////////////////////////////////////////////////////// - void gen_rct_backward(const 
si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void gen_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { - for (ui32 i = repeat; i > 0; --i) + assert((y->flags | line_buf::LFT_REVERSIBLE) && + (cb->flags | line_buf::LFT_REVERSIBLE) && + (cr->flags | line_buf::LFT_REVERSIBLE) && + (r->flags | line_buf::LFT_REVERSIBLE) && + (g->flags | line_buf::LFT_REVERSIBLE) && + (b->flags | line_buf::LFT_REVERSIBLE)); + + if (y->flags | line_buf::LFT_32BIT) { - *g = *y++ - ((*cb + *cr)>>2); - *b++ = *cb++ + *g; - *r++ = *cr++ + *g++; + assert((y->flags | line_buf::LFT_32BIT) && + (cb->flags | line_buf::LFT_32BIT) && + (cr->flags | line_buf::LFT_32BIT) && + (r->flags | line_buf::LFT_32BIT) && + (g->flags | line_buf::LFT_32BIT) && + (b->flags | line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (ui32 i = repeat; i > 0; --i) + { + si32 yy = *yp++, cbb = *cbp++, crr = *crp++; + si32 gg = yy - ((cbb + crr) >> 2); + *rp++ = crr + gg; + *gp++ = gg; + *bp++ = cbb + gg; + } + } + else + { + assert((y->flags | line_buf::LFT_64BIT) && + (cb->flags | line_buf::LFT_64BIT) && + (cr->flags | line_buf::LFT_64BIT) && + (r->flags | line_buf::LFT_32BIT) && + (g->flags | line_buf::LFT_32BIT) && + (b->flags | line_buf::LFT_32BIT)); + const si64 *yp = (si64*)y->p, *cbp = (si64*)cb->p, *crp = (si64*)cr->p; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (ui32 i = repeat; i > 0; --i) + { + si64 yy = *yp++, cbb = *cbp++, crr = *crp++; + si64 gg = yy - ((cbb + crr) >> 2); + *rp++ = (si32)(crr + gg); + *gp++ = (si32)gg; + *bp++ = (si32)(cbb + gg); + } } } diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h index a03759e7..cc42aaa5 100644 --- a/src/core/transform/ojph_colour.h +++ b/src/core/transform/ojph_colour.h @@ -40,34 +40,42 @@ #define OJPH_COLOR_H namespace ojph 
{ + + // defined elsewhere + class line_buf; + namespace local { //////////////////////////////////////////////////////////////////////////// void init_colour_transform_functions(); //////////////////////////////////////////////////////////////////////////// - extern void (*cnvrt_si32_to_si32_shftd) - (const line_buf* sp, line_buf* dp, int shift, ui32 width); + extern void (*rev_convert) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); //////////////////////////////////////////////////////////////////////////// - extern void (*cnvrt_si32_to_si32_nlt_type3) - (const line_buf *sp, line_buf *dp, int shift, ui32 width); + extern void (*rev_convert_nlt_type3) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_si32_to_float_shftd) - (const line_buf *sp, line_buf *dp, float mul, ui32 width); + (const si32 *sp, float *dp, float mul, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_si32_to_float) - (const line_buf *sp, line_buf *dp, float mul, ui32 width); + (const si32 *sp, float *dp, float mul, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_float_to_si32_shftd) - (const line_buf *sp, line_buf *dp, float mul, ui32 width); + (const float *sp, si32 *dp, float mul, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*cnvrt_float_to_si32) - (const line_buf *sp, line_buf *dp, float mul, ui32 width); + (const float *sp, si32 *dp, float mul, ui32 width); //////////////////////////////////////////////////////////////////////////// extern void (*rct_forward) @@ -81,13 +89,13 @@ namespace ojph { 
//////////////////////////////////////////////////////////////////////////// extern void (*ict_forward) - (const line_buf *r, const line_buf *g, const line_buf *b, - line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); + (const float *r, const float *g, const float *b, + float *y, float *cb, float *cr, ui32 repeat); //////////////////////////////////////////////////////////////////////////// extern void (*ict_backward) - (const line_buf *y, const line_buf *cb, const line_buf *cr, - line_buf *r, line_buf *g, line_buf *b, ui32 repeat); + (const float *y, const float *cb, const float *cr, + float *r, float *g, float *b, ui32 repeat); } } diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index ae5eba1b..08e99a92 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -65,12 +65,16 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void gen_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); + void gen_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, @@ -89,12 +93,14 @@ namespace ojph { ui32 width); ////////////////////////////////////////////////////////////////////////// - void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, 
ui32 repeat); + void gen_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void gen_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// void gen_ict_forward(const float *r, const float *g, const float *b, From 5115fb52456035b0edb58236bc615de2db2d1be8 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 29 Oct 2024 17:53:25 +1100 Subject: [PATCH 34/78] Fixes warnings. --- src/core/codestream/ojph_tile.cpp | 1 - src/core/transform/ojph_colour.cpp | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index b1cb95a7..99fe75d9 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -391,7 +391,6 @@ namespace ojph { src_line = lines + comp_num; else src_line = comps[comp_num].pull_line(); - si32* dp = tgt_line->i32 + line_offsets[comp_num]; if (is_signed[comp_num] && nlt_type3[comp_num]) rev_convert_nlt_type3(src_line, 0, tgt_line, line_offsets[comp_num], shift + 1, comp_width); diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 154b66b7..608ee9ae 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -217,9 +217,9 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width) { - if (src_line->flags | line_buf::LFT_32BIT) + if ((src_line->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) { - if (dst_line->flags | line_buf::LFT_32BIT) + if ((dst_line->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) { const si32 *sp = 
src_line->i32 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; @@ -252,9 +252,9 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width) { - if (src_line->flags | line_buf::LFT_32BIT) + if ((src_line->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) { - if (dst_line->flags | line_buf::LFT_32BIT) + if ((dst_line->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; @@ -331,7 +331,7 @@ namespace ojph { (g->flags | line_buf::LFT_REVERSIBLE) && (b->flags | line_buf::LFT_REVERSIBLE)); - if (y->flags | line_buf::LFT_32BIT) + if ((y->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) { assert((y->flags | line_buf::LFT_32BIT) && (cb->flags | line_buf::LFT_32BIT) && @@ -381,7 +381,7 @@ namespace ojph { (g->flags | line_buf::LFT_REVERSIBLE) && (b->flags | line_buf::LFT_REVERSIBLE)); - if (y->flags | line_buf::LFT_32BIT) + if ((y->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) { assert((y->flags | line_buf::LFT_32BIT) && (cb->flags | line_buf::LFT_32BIT) && From 4d47c404298b206862139caa1a71fcb2a3f06dbc Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 29 Oct 2024 17:56:24 +1100 Subject: [PATCH 35/78] A bug fix. 
--- src/core/transform/ojph_colour.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 608ee9ae..832d994b 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -217,9 +217,9 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width) { - if ((src_line->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) + if (src_line->flags & line_buf::LFT_32BIT) { - if ((dst_line->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) + if (dst_line->flags & line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; @@ -252,9 +252,9 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width) { - if ((src_line->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) + if (src_line->flags & line_buf::LFT_32BIT) { - if ((dst_line->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) + if (dst_line->flags & line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; @@ -331,7 +331,7 @@ namespace ojph { (g->flags | line_buf::LFT_REVERSIBLE) && (b->flags | line_buf::LFT_REVERSIBLE)); - if ((y->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) + if (y->flags & line_buf::LFT_32BIT) { assert((y->flags | line_buf::LFT_32BIT) && (cb->flags | line_buf::LFT_32BIT) && @@ -381,7 +381,7 @@ namespace ojph { (g->flags | line_buf::LFT_REVERSIBLE) && (b->flags | line_buf::LFT_REVERSIBLE)); - if ((y->flags | line_buf::LFT_32BIT) == line_buf::LFT_32BIT) + if (y->flags & line_buf::LFT_32BIT) { assert((y->flags | line_buf::LFT_32BIT) && (cb->flags | line_buf::LFT_32BIT) && From 186e3edf7bd8cbcbd4d45efd6d91a5ef01686e3d Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 29 Oct 2024 18:59:12 +1100 Subject: [PATCH 36/78] A bug fix. 
--- src/core/common/ojph_mem.h | 4 +- src/core/transform/ojph_colour.cpp | 72 +++++++++++++++--------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h index abd5a0f4..2f074a42 100644 --- a/src/core/common/ojph_mem.h +++ b/src/core/common/ojph_mem.h @@ -135,7 +135,7 @@ namespace ojph { class line_buf { public: - enum line_buf_type { + enum : ui32 { LFT_UNDEFINED = 0x00, // Type is undefined/uninitialized // These flags reflects data size in bytes LFT_BYTE = 0x01, // Set when data is 1 byte @@ -167,7 +167,7 @@ namespace ojph { size_t size; ui32 pre_size; - line_buf_type flags; + ui32 flags; union { si32* i32; // 32bit integer type, used for lossless compression float* f32; // float type, used for lossy compression diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 832d994b..6dde7b63 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -324,21 +324,21 @@ namespace ojph { const line_buf *r, const line_buf *g, const line_buf *b, line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { - assert((y->flags | line_buf::LFT_REVERSIBLE) && - (cb->flags | line_buf::LFT_REVERSIBLE) && - (cr->flags | line_buf::LFT_REVERSIBLE) && - (r->flags | line_buf::LFT_REVERSIBLE) && - (g->flags | line_buf::LFT_REVERSIBLE) && - (b->flags | line_buf::LFT_REVERSIBLE)); + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); if (y->flags & line_buf::LFT_32BIT) { - assert((y->flags | line_buf::LFT_32BIT) && - (cb->flags | line_buf::LFT_32BIT) && - (cr->flags | line_buf::LFT_32BIT) && - (r->flags | line_buf::LFT_32BIT) && - (g->flags | line_buf::LFT_32BIT) && - (b->flags | line_buf::LFT_32BIT)); + assert((y->flags & line_buf::LFT_32BIT) 
&& + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; for (ui32 i = repeat; i > 0; --i) @@ -351,12 +351,12 @@ namespace ojph { } else { - assert((y->flags | line_buf::LFT_64BIT) && - (cb->flags | line_buf::LFT_64BIT) && - (cr->flags | line_buf::LFT_64BIT) && - (r->flags | line_buf::LFT_32BIT) && - (g->flags | line_buf::LFT_32BIT) && - (b->flags | line_buf::LFT_32BIT)); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; si64 *yp = (si64*)y->p, *cbp = (si64*)cb->p, *crp = (si64*)cr->p; for (ui32 i = repeat; i > 0; --i) @@ -374,21 +374,21 @@ namespace ojph { const line_buf *y, const line_buf *cb, const line_buf *cr, line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { - assert((y->flags | line_buf::LFT_REVERSIBLE) && - (cb->flags | line_buf::LFT_REVERSIBLE) && - (cr->flags | line_buf::LFT_REVERSIBLE) && - (r->flags | line_buf::LFT_REVERSIBLE) && - (g->flags | line_buf::LFT_REVERSIBLE) && - (b->flags | line_buf::LFT_REVERSIBLE)); + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); if (y->flags & line_buf::LFT_32BIT) { - assert((y->flags | line_buf::LFT_32BIT) && - (cb->flags | line_buf::LFT_32BIT) && - (cr->flags | line_buf::LFT_32BIT) && - (r->flags | line_buf::LFT_32BIT) && - (g->flags | line_buf::LFT_32BIT) && - (b->flags | line_buf::LFT_32BIT)); + assert((y->flags & 
line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; for (ui32 i = repeat; i > 0; --i) @@ -402,12 +402,12 @@ namespace ojph { } else { - assert((y->flags | line_buf::LFT_64BIT) && - (cb->flags | line_buf::LFT_64BIT) && - (cr->flags | line_buf::LFT_64BIT) && - (r->flags | line_buf::LFT_32BIT) && - (g->flags | line_buf::LFT_32BIT) && - (b->flags | line_buf::LFT_32BIT)); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); const si64 *yp = (si64*)y->p, *cbp = (si64*)cb->p, *crp = (si64*)cr->p; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; for (ui32 i = repeat; i > 0; --i) From 6993317e1c240ad089f3f529351976e88abb9fd0 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Wed, 30 Oct 2024 21:43:42 +1100 Subject: [PATCH 37/78] Code supports 64 bits, no SIMD, no codeblock encoding/decoding --- src/core/codestream/ojph_codeblock.cpp | 150 +++++-- src/core/codestream/ojph_codeblock.h | 19 +- src/core/codestream/ojph_codeblock_fun.cpp | 167 +++++--- src/core/codestream/ojph_codeblock_fun.h | 48 ++- src/core/codestream/ojph_codestream_avx.cpp | 2 +- src/core/codestream/ojph_codestream_avx2.cpp | 18 +- src/core/codestream/ojph_codestream_gen.cpp | 89 +++- src/core/codestream/ojph_codestream_sse.cpp | 2 +- src/core/codestream/ojph_codestream_sse2.cpp | 18 +- src/core/codestream/ojph_params.cpp | 16 + src/core/codestream/ojph_params_local.h | 4 + src/core/codestream/ojph_resolution.cpp | 42 +- src/core/codestream/ojph_subband.cpp | 25 +- src/core/codestream/ojph_subband.h | 2 + src/core/codestream/ojph_tile.cpp | 3 +- 
src/core/coding/ojph_block_decoder.cpp | 21 +- src/core/coding/ojph_block_decoder.h | 7 +- src/core/coding/ojph_block_encoder.cpp | 19 +- src/core/coding/ojph_block_encoder.h | 17 +- src/core/common/ojph_mem.h | 3 +- src/core/others/ojph_mem.cpp | 24 +- src/core/transform/ojph_colour.cpp | 12 +- src/core/transform/ojph_transform.cpp | 406 +++++++++++++++---- 23 files changed, 851 insertions(+), 263 deletions(-) diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp index 09159513..45c91416 100644 --- a/src/core/codestream/ojph_codeblock.cpp +++ b/src/core/codestream/ojph_codeblock.cpp @@ -45,6 +45,7 @@ #include "ojph_codestream_local.h" #include "ojph_codeblock.h" #include "ojph_subband.h" +#include "ojph_resolution.h" namespace ojph { @@ -52,7 +53,7 @@ namespace ojph { { ////////////////////////////////////////////////////////////////////////// - void codeblock::pre_alloc(codestream *codestream, + void codeblock::pre_alloc(codestream *codestream, ui32 comp_num, const size& nominal) { mem_fixed_allocator* allocator = codestream->get_allocator(); @@ -60,7 +61,14 @@ namespace ojph { assert(byte_alignment / sizeof(ui32) > 1); const ui32 f = byte_alignment / sizeof(ui32) - 1; ui32 stride = (nominal.w + f) & ~f; // a multiple of 8 - allocator->pre_alloc_data(nominal.h * stride, 0); + + const param_siz* sz = codestream->get_siz(); + const param_cod* cd = codestream->get_cod(comp_num); + ui32 bit_depth = cd->propose_implementation_precision(sz); + if (bit_depth <= 32) + allocator->pre_alloc_data(nominal.h * stride, 0); + else + allocator->pre_alloc_data(nominal.h * stride, 0); } ////////////////////////////////////////////////////////////////////////// @@ -75,7 +83,19 @@ namespace ojph { const ui32 f = byte_alignment / sizeof(ui32) - 1; this->stride = (nominal.w + f) & ~f; // a multiple of 8 this->buf_size = this->stride * nominal.h; - this->buf = allocator->post_alloc_data(this->buf_size, 0); + + ui32 comp_num = 
parent->get_parent()->get_comp_num(); + const param_siz* sz = codestream->get_siz(); + const param_cod* cd = codestream->get_cod(comp_num); + ui32 bit_depth = cd->propose_implementation_precision(sz); + if (bit_depth <= 32) { + precision = BUF32; + this->buf32 = allocator->post_alloc_data(this->buf_size, 0); + } + else { + precision = BUF64; + this->buf64 = allocator->post_alloc_data(this->buf_size, 0); + } this->nominal_size = nominal; this->cb_size = cb_size; @@ -85,8 +105,8 @@ namespace ojph { this->delta = parent->get_delta(); this->delta_inv = 1.0f / this->delta; this->K_max = K_max; - for (int i = 0; i < 8; ++i) - this->max_val[i] = 0; + for (int i = 0; i < 4; ++i) + this->max_val64[i] = 0; ojph::param_cod cod = codestream->access_cod(); this->reversible = cod.is_reversible(); this->resilient = codestream->is_resilient(); @@ -100,28 +120,61 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void codeblock::push(line_buf *line) { - // convert to sign and magnitude and keep max_val - const si32 *sp = line->i32 + line_offset; - ui32 *dp = buf + cur_line * stride; - this->codeblock_functions.tx_to_cb(sp, dp, K_max, delta_inv, cb_size.w, - max_val); - ++cur_line; + // convert to sign and magnitude and keep max_val + if (precision == BUF32) + { + assert(line->flags & line_buf::LFT_32BIT); + const si32 *sp = line->i32 + line_offset; + ui32 *dp = buf32 + cur_line * stride; + this->codeblock_functions.tx_to_cb32(sp, dp, K_max, delta_inv, + cb_size.w, max_val32); + ++cur_line; + } + else + { + assert(precision == BUF64); + assert(line->flags & line_buf::LFT_64BIT); + const si64 *sp = line->i64 + line_offset; + ui64 *dp = buf64 + cur_line * stride; + this->codeblock_functions.tx_to_cb64(sp, dp, K_max, delta_inv, + cb_size.w, max_val64); + ++cur_line; + } } ////////////////////////////////////////////////////////////////////////// void codeblock::encode(mem_elastic_allocator *elastic) { - ui32 mv = 
this->codeblock_functions.find_max_val(max_val); - if (mv >= 1u<<(31 - K_max)) + if (precision == BUF32) + { + ui32 mv = this->codeblock_functions.find_max_val32(max_val32); + if (mv >= 1u << (31 - K_max)) + { + coded_cb->missing_msbs = K_max - 1; + assert(coded_cb->missing_msbs > 0); + assert(coded_cb->missing_msbs < K_max); + coded_cb->num_passes = 1; + + this->codeblock_functions.encode_cb32(buf32, K_max-1, 1, + cb_size.w, cb_size.h, stride, coded_cb->pass_length, + elastic, coded_cb->next_coded); + } + } + else { - coded_cb->missing_msbs = K_max - 1; - assert(coded_cb->missing_msbs > 0); - assert(coded_cb->missing_msbs < K_max); - coded_cb->num_passes = 1; - - this->codeblock_functions.encode_cb(buf, K_max-1, 1, - cb_size.w, cb_size.h, stride, coded_cb->pass_length, - elastic, coded_cb->next_coded); + assert(precision == BUF64); + ui64 mv = this->codeblock_functions.find_max_val64(max_val64); + if (mv >= 1ULL << (63 - K_max)) + { + coded_cb->missing_msbs = K_max - 1; + assert(coded_cb->missing_msbs > 0); + assert(coded_cb->missing_msbs < K_max); + coded_cb->num_passes = 1; + + this->codeblock_functions.encode_cb64(buf64, K_max-1, 1, + cb_size.w, cb_size.h, stride, coded_cb->pass_length, + elastic, coded_cb->next_coded); + } } } @@ -132,8 +185,8 @@ namespace ojph { this->cb_size = cb_size; this->coded_cb = coded_cb; this->cur_line = 0; - for (int i = 0; i < 8; ++i) - this->max_val[i] = 0; + for (int i = 0; i < 4; ++i) + this->max_val64[i] = 0; this->zero_block = false; } @@ -143,11 +196,24 @@ namespace ojph { if (coded_cb->pass_length[0] > 0 && coded_cb->num_passes > 0 && coded_cb->next_coded != NULL) { - bool result = this->codeblock_functions.decode_cb( + bool result; + if (precision == BUF32) + { + result = this->codeblock_functions.decode_cb32( + coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size, + buf32, coded_cb->missing_msbs, coded_cb->num_passes, + coded_cb->pass_length[0], coded_cb->pass_length[1], + cb_size.w, cb_size.h, stride, 
stripe_causal); + } + else + { + assert(precision == BUF64); + result = this->codeblock_functions.decode_cb64( coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size, - buf, coded_cb->missing_msbs, coded_cb->num_passes, + buf64, coded_cb->missing_msbs, coded_cb->num_passes, coded_cb->pass_length[0], coded_cb->pass_length[1], cb_size.w, cb_size.h, stride, stripe_causal); + } if (result == false) { @@ -167,15 +233,37 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void codeblock::pull_line(line_buf *line) { - si32 *dp = line->i32 + line_offset; - if (!zero_block) + //convert to sign and magnitude + if (precision == BUF32) { - //convert to sign and magnitude - const ui32 *sp = buf + cur_line * stride; - this->codeblock_functions.tx_from_cb(sp, dp, K_max, delta, cb_size.w); + assert(line->flags & line_buf::LFT_32BIT); + si32 *dp = line->i32 + line_offset; + if (!zero_block) + { + const ui32 *sp = buf32 + cur_line * stride; + this->codeblock_functions.tx_from_cb32(sp, dp, K_max, delta, + cb_size.w); + } + else + this->codeblock_functions.mem_clear32(dp, cb_size.w * sizeof(ui32)); } else - this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp)); + { + assert(precision == BUF64); + assert(line->flags & line_buf::LFT_64BIT); + si64 *dp = line->i64 + line_offset; + if (!zero_block) + { + const ui64 *sp = buf64 + cur_line * stride; + this->codeblock_functions.tx_from_cb64(sp, dp, K_max, delta, + cb_size.w); + } + else + this->codeblock_functions.mem_clear64(dp, cb_size.w * sizeof(*dp)); + + + } + ++cur_line; assert(cur_line <= cb_size.h); } diff --git a/src/core/codestream/ojph_codeblock.h b/src/core/codestream/ojph_codeblock.h index fc42aee1..fde8e6ac 100644 --- a/src/core/codestream/ojph_codeblock.h +++ b/src/core/codestream/ojph_codeblock.h @@ -65,8 +65,14 @@ namespace ojph { class codeblock { friend struct precinct; + enum : ui32 { + BUF32 = 4, + BUF64 = 8, + }; + public: - static void pre_alloc(codestream 
*codestream, const size& nominal); + static void pre_alloc(codestream *codestream, ui32 comp_num, + const size& nominal); void finalize_alloc(codestream *codestream, subband* parent, const size& nominal, const size& cb_size, coded_cb_header* coded_cb, @@ -79,7 +85,11 @@ namespace ojph { void pull_line(line_buf *line); private: - ui32* buf; + ui32 precision; + union { + ui32* buf32; + ui64* buf64; + }; size nominal_size; size cb_size; ui32 stride; @@ -93,7 +103,10 @@ namespace ojph { bool resilient; bool stripe_causal; bool zero_block; // true when the decoded block is all zero - ui32 max_val[8]; // supports up to 256 bits + union { + ui32 max_val32[8]; // supports up to 256 bits + ui64 max_val64[4]; // supports up to 256 bits + }; coded_cb_header* coded_cb; codeblock_fun codeblock_functions; }; diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index 51253c1b..8cc034a7 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -57,72 +57,109 @@ namespace ojph { { ////////////////////////////////////////////////////////////////////////// - void gen_mem_clear(void* addr, size_t count); - void sse_mem_clear(void* addr, size_t count); - void avx_mem_clear(void* addr, size_t count); - void wasm_mem_clear(void* addr, size_t count); + void gen_mem_clear32(si32* addr, size_t count); + void sse_mem_clear32(si32* addr, size_t count); + void avx_mem_clear32(si32* addr, size_t count); + void wasm_mem_clear32(si32* addr, size_t count); - ////////////////////////////////////////////////////////////////////////// - ui32 gen_find_max_val(ui32* address); - ui32 sse2_find_max_val(ui32* address); - ui32 avx2_find_max_val(ui32* address); - ui32 wasm_find_max_val(ui32* address); + void gen_mem_clear64(si64* addr, size_t count); + void sse_mem_clear64(si64* addr, size_t count); + void avx_mem_clear64(si64* addr, size_t count); + void wasm_mem_clear64(si64* addr, size_t count); 
////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); - void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val); + ui32 gen_find_max_val32(ui32* address); + ui32 sse2_find_max_val32(ui32* address); + ui32 avx2_find_max_val32(ui32* address); + ui32 wasm_find_max_val32(ui32* address); + ui64 gen_find_max_val64(ui64* address); + ui64 sse2_find_max_val64(ui64* address); + ui64 avx2_find_max_val64(ui64* address); + ui64 wasm_find_max_val64(ui64* address); + ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); - void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - 
float delta, ui32 count); - void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count); + void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); + + void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + ////////////////////////////////////////////////////////////////////////// + void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + 
void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + + void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); void codeblock_fun::init(bool reversible) { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) // Default path, no acceleration. We may change this later - decode_cb = ojph_decode_codeblock; - find_max_val = gen_find_max_val; - mem_clear = gen_mem_clear; + decode_cb32 = ojph_decode_codeblock32; + find_max_val32 = gen_find_max_val32; + mem_clear32 = gen_mem_clear32; if (reversible) { - tx_to_cb = gen_rev_tx_to_cb; - tx_from_cb = gen_rev_tx_from_cb; + tx_to_cb32 = gen_rev_tx_to_cb32; + tx_from_cb32 = gen_rev_tx_from_cb32; } else { - tx_to_cb = gen_irv_tx_to_cb; - tx_from_cb = gen_irv_tx_from_cb; + tx_to_cb32 = gen_irv_tx_to_cb32; + tx_from_cb32 = gen_irv_tx_from_cb32; } - encode_cb = ojph_encode_codeblock; + encode_cb32 = ojph_encode_codeblock32; + + decode_cb64 = ojph_decode_codeblock64; + find_max_val64 = gen_find_max_val64; + mem_clear64 = gen_mem_clear64; + if (reversible) { + tx_to_cb64 = gen_rev_tx_to_cb64; + tx_from_cb64 = gen_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; + } + encode_cb64 = ojph_encode_codeblock64; #ifndef OJPH_DISABLE_SIMD @@ -131,52 +168,52 @@ namespace ojph { // Accelerated functions for INTEL/AMD CPUs #ifndef OJPH_DISABLE_SSE if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE) - mem_clear = sse_mem_clear; + mem_clear32 = sse_mem_clear32; #endif // !OJPH_DISABLE_SSE #ifndef OJPH_DISABLE_SSE2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) { - find_max_val = sse2_find_max_val; + find_max_val32 = sse2_find_max_val32; if 
(reversible) { - tx_to_cb = sse2_rev_tx_to_cb; - tx_from_cb = sse2_rev_tx_from_cb; + tx_to_cb32 = sse2_rev_tx_to_cb32; + tx_from_cb32 = sse2_rev_tx_from_cb32; } else { - tx_to_cb = sse2_irv_tx_to_cb; - tx_from_cb = sse2_irv_tx_from_cb; + tx_to_cb32 = sse2_irv_tx_to_cb32; + tx_from_cb32 = sse2_irv_tx_from_cb32; } } #endif // !OJPH_DISABLE_SSE2 #ifndef OJPH_DISABLE_SSSE3 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSSE3) - decode_cb = ojph_decode_codeblock_ssse3; + decode_cb32 = ojph_decode_codeblock_ssse3; #endif // !OJPH_DISABLE_SSSE3 #ifndef OJPH_DISABLE_AVX if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX) - mem_clear = avx_mem_clear; + mem_clear32 = avx_mem_clear32; #endif // !OJPH_DISABLE_AVX #ifndef OJPH_DISABLE_AVX2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - find_max_val = avx2_find_max_val; + find_max_val32 = avx2_find_max_val32; if (reversible) { - tx_to_cb = avx2_rev_tx_to_cb; - tx_from_cb = avx2_rev_tx_from_cb; + tx_to_cb32 = avx2_rev_tx_to_cb32; + tx_from_cb32 = avx2_rev_tx_from_cb32; } else { - tx_to_cb = avx2_irv_tx_to_cb; - tx_from_cb = avx2_irv_tx_from_cb; + tx_to_cb32 = avx2_irv_tx_to_cb32; + tx_from_cb32 = avx2_irv_tx_from_cb32; } - encode_cb = ojph_encode_codeblock_avx2; - decode_cb = ojph_decode_codeblock_avx2; + encode_cb32 = ojph_encode_codeblock_avx2; + decode_cb32 = ojph_decode_codeblock_avx2; } #endif // !OJPH_DISABLE_AVX2 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) - encode_cb = ojph_encode_codeblock_avx512; + encode_cb32 = ojph_encode_codeblock_avx512; #endif // !OJPH_DISABLE_AVX512 #elif defined(OJPH_ARCH_ARM) diff --git a/src/core/codestream/ojph_codeblock_fun.h b/src/core/codestream/ojph_codeblock_fun.h index 679b2d34..03b3b243 100644 --- a/src/core/codestream/ojph_codeblock_fun.h +++ b/src/core/codestream/ojph_codeblock_fun.h @@ -48,26 +48,44 @@ namespace ojph { namespace local { // define function signature simple memory clearing - typedef void 
(*mem_clear_fun)(void* addr, size_t count); + typedef void (*mem_clear_fun32)(si32* addr, size_t count); + typedef void (*mem_clear_fun64)(si64* addr, size_t count); // define function signature for max value finding - typedef ui32 (*find_max_val_fun)(ui32* addr); + typedef ui32 (*find_max_val_fun32)(ui32* addr); + + typedef ui64 (*find_max_val_fun64)(ui64* addr); // define line transfer function signature from subbands to codeblocks - typedef void (*tx_to_cb_fun)(const void *sp, ui32 *dp, ui32 K_max, + typedef void (*tx_to_cb_fun32)(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32* max_val); + typedef void (*tx_to_cb_fun64)(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); + // define line transfer function signature from codeblock to subband - typedef void (*tx_from_cb_fun)(const ui32 *sp, void *dp, ui32 K_max, + typedef void (*tx_from_cb_fun32)(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); + + typedef void (*tx_from_cb_fun64)(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count); // define the block decoder function signature - typedef bool (*cb_decoder_fun)(ui8* coded_data, ui32* decoded_data, + typedef bool (*cb_decoder_fun32)(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + + typedef bool (*cb_decoder_fun64)(ui8* coded_data, ui64* decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal); // define the block encoder function signature - typedef void (*cb_encoder_fun)(ui32* buf, ui32 missing_msbs, + typedef void (*cb_encoder_fun32)(ui32* buf, ui32 missing_msbs, + ui32 num_passes, ui32 width, ui32 height, ui32 stride, + ui32* lengths, ojph::mem_elastic_allocator* elastic, + ojph::coded_lists*& coded); + + typedef void (*cb_encoder_fun64)(ui64* buf, ui32 missing_msbs, ui32 
num_passes, ui32 width, ui32 height, ui32 stride, ui32* lengths, ojph::mem_elastic_allocator* elastic, ojph::coded_lists*& coded); @@ -78,22 +96,28 @@ namespace ojph { void init(bool reversible); // a pointer to the max value finding function - mem_clear_fun mem_clear; + mem_clear_fun32 mem_clear32; + mem_clear_fun64 mem_clear64; // a pointer to the max value finding function - find_max_val_fun find_max_val; + find_max_val_fun32 find_max_val32; + find_max_val_fun64 find_max_val64; // a pointer to function transferring samples from subbands to codeblocks - tx_to_cb_fun tx_to_cb; + tx_to_cb_fun32 tx_to_cb32; + tx_to_cb_fun64 tx_to_cb64; // a pointer to function transferring samples from codeblocks to subbands - tx_from_cb_fun tx_from_cb; + tx_from_cb_fun32 tx_from_cb32; + tx_from_cb_fun64 tx_from_cb64; // a pointer to the decoder function - cb_decoder_fun decode_cb; + cb_decoder_fun32 decode_cb32; + cb_decoder_fun64 decode_cb64; // a pointer to the encoder function - cb_encoder_fun encode_cb; + cb_encoder_fun32 encode_cb32; + cb_encoder_fun64 encode_cb64; }; } diff --git a/src/core/codestream/ojph_codestream_avx.cpp b/src/core/codestream/ojph_codestream_avx.cpp index 4c6d678d..22405c7e 100644 --- a/src/core/codestream/ojph_codestream_avx.cpp +++ b/src/core/codestream/ojph_codestream_avx.cpp @@ -42,7 +42,7 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void avx_mem_clear(void* addr, size_t count) + void avx_mem_clear32(si32* addr, size_t count) { float* p = (float*)addr; __m256 zero = _mm256_setzero_ps(); diff --git a/src/core/codestream/ojph_codestream_avx2.cpp b/src/core/codestream/ojph_codestream_avx2.cpp index 04a81ed0..bd849b59 100644 --- a/src/core/codestream/ojph_codestream_avx2.cpp +++ b/src/core/codestream/ojph_codestream_avx2.cpp @@ -42,7 +42,7 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - ui32 avx2_find_max_val(ui32* 
address) + ui32 avx2_find_max_val32(ui32* address) { __m128i x0 = _mm_loadu_si128((__m128i*)address); __m128i x1 = _mm_loadu_si128((__m128i*)address + 1); @@ -56,8 +56,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(delta_inv); @@ -80,8 +80,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(K_max); @@ -106,8 +106,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; @@ -124,8 +124,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF); diff --git a/src/core/codestream/ojph_codestream_gen.cpp b/src/core/codestream/ojph_codestream_gen.cpp index 466f4835..50fc878d 100644 --- a/src/core/codestream/ojph_codestream_gen.cpp +++ b/src/core/codestream/ojph_codestream_gen.cpp @@ -42,20 +42,29 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void gen_mem_clear(void* addr, size_t count) + void 
gen_mem_clear32(si32* addr, size_t count) { - ui32* p = (ui32*)addr; - for (size_t i = 0; i < count; i += 4, p += 1) - *p = 0; + for (size_t i = 0; i < count; i += 4) + *addr++ = 0; } ////////////////////////////////////////////////////////////////////////// - ui32 gen_find_max_val(ui32* addr) { return addr[0]; } + void gen_mem_clear64(si64* addr, size_t count) + { + for (size_t i = 0; i < count; i += 8) + *addr++ = 0; + } + + ////////////////////////////////////////////////////////////////////////// + ui32 gen_find_max_val32(ui32* addr) { return addr[0]; } ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, - ui32* max_val) + ui64 gen_find_max_val64(ui64* addr) { return addr[0]; } + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, + ui32* max_val) { ojph_unused(delta_inv); ui32 shift = 31 - K_max; @@ -65,7 +74,7 @@ namespace ojph { for (ui32 i = count; i > 0; --i) { si32 v = *p++; - ui32 sign = v >= 0 ? 0 : 0x80000000; + ui32 sign = v >= 0 ? 0U : 0x80000000U; ui32 val = (ui32)(v >= 0 ? v : -v); val <<= shift; *dp++ = sign | val; @@ -75,9 +84,31 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, - ui32* max_val) + void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, + ui64* max_val) + { + ojph_unused(delta_inv); + ui32 shift = 63 - K_max; + // convert to sign and magnitude and keep max_val + ui64 tmax = *max_val; + si64 *p = (si64*)sp; + for (ui32 i = count; i > 0; --i) + { + si64 v = *p++; + ui64 sign = v >= 0 ? 0ULL : 0x8000000000000000ULL; + ui64 val = (ui64)(v >= 0 ? 
v : -v); + val <<= shift; + *dp++ = sign | val; + tmax |= val; // it is more efficient to use or than max + } + *max_val = tmax; + } + + ////////////////////////////////////////////////////////////////////////// + void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, + ui32* max_val) { ojph_unused(K_max); //quantize and convert to sign and magnitude and keep max_val @@ -87,7 +118,7 @@ namespace ojph { { float v = *p++; si32 t = ojph_trunc(v * delta_inv); - ui32 sign = t >= 0 ? 0 : 0x80000000; + ui32 sign = t >= 0 ? 0U : 0x80000000U; ui32 val = (ui32)(t >= 0 ? t : -t); *dp++ = sign | val; tmax |= val; // it is more efficient to use or than max @@ -96,8 +127,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; @@ -106,14 +137,30 @@ namespace ojph { for (ui32 i = count; i > 0; --i) { ui32 v = *sp++; - si32 val = (v & 0x7FFFFFFF) >> shift; - *p++ = (v & 0x80000000) ? -val : val; + si32 val = (v & 0x7FFFFFFFU) >> shift; + *p++ = (v & 0x80000000U) ? -val : val; + } + } + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 63 - K_max; + //convert to sign and magnitude + si64 *p = (si64*)dp; + for (ui32 i = count; i > 0; --i) + { + ui64 v = *sp++; + si64 val = (v & 0x7FFFFFFFFFFFFFFFULL) >> shift; + *p++ = (v & 0x8000000000000000ULL) ? 
-val : val; } } ////////////////////////////////////////////////////////////////////////// - void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); //convert to sign and magnitude @@ -121,8 +168,8 @@ namespace ojph { for (ui32 i = count; i > 0; --i) { ui32 v = *sp++; - float val = (float)(v & 0x7FFFFFFF) * delta; - *p++ = (v & 0x80000000) ? -val : val; + float val = (float)(v & 0x7FFFFFFFU) * delta; + *p++ = (v & 0x80000000U) ? -val : val; } } diff --git a/src/core/codestream/ojph_codestream_sse.cpp b/src/core/codestream/ojph_codestream_sse.cpp index 7c64ad93..99082aaa 100644 --- a/src/core/codestream/ojph_codestream_sse.cpp +++ b/src/core/codestream/ojph_codestream_sse.cpp @@ -42,7 +42,7 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void sse_mem_clear(void* addr, size_t count) + void sse_mem_clear32(si32* addr, size_t count) { float* p = (float*)addr; __m128 zero = _mm_setzero_ps(); diff --git a/src/core/codestream/ojph_codestream_sse2.cpp b/src/core/codestream/ojph_codestream_sse2.cpp index 9bb06434..145db822 100644 --- a/src/core/codestream/ojph_codestream_sse2.cpp +++ b/src/core/codestream/ojph_codestream_sse2.cpp @@ -42,7 +42,7 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - ui32 sse2_find_max_val(ui32* address) + ui32 sse2_find_max_val32(ui32* address) { __m128i x1, x0 = _mm_loadu_si128((__m128i*)address); x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3] @@ -59,8 +59,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* 
max_val) { ojph_unused(delta_inv); @@ -88,8 +88,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(K_max); @@ -118,8 +118,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; @@ -141,8 +141,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); __m128i m1 = _mm_set1_epi32(0x7FFFFFFF); diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index 2bd3987f..59839996 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -776,6 +776,22 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + ui32 + param_cod::propose_implementation_precision(const param_siz* siz) const + { + bool employing_color_transform = is_employing_color_transform() ? 
1 : 0; + bool reversible = atk->is_reversible(); + + ui32 bit_depth = 32; + if (reversible) { + bit_depth = siz->get_bit_depth(comp_num); + bit_depth += employing_color_transform + get_num_decompositions(); + } + + return bit_depth; + } + ////////////////////////////////////////////////////////////////////////// bool param_cod::write(outfile_base *file) { diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index ac8bb776..8dc1b59c 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -263,6 +263,7 @@ namespace ojph { ui32 t = ojph_div_ceil(Xsiz, ds) - ojph_div_ceil(XOsiz, ds); return t; } + ui32 get_height(ui32 comp_num) const { assert(comp_num < get_num_components()); @@ -516,6 +517,9 @@ namespace ojph { return (Scod & 4) == 4; } + //////////////////////////////////////// + ui32 propose_implementation_precision(const param_siz* siz) const; + //////////////////////////////////////// bool write(outfile_base *file); diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp index 87466e0d..fb4efdfe 100644 --- a/src/core/codestream/ojph_resolution.cpp +++ b/src/core/codestream/ojph_resolution.cpp @@ -199,6 +199,9 @@ namespace ojph { allocator->pre_alloc_obj((size_t)num_precincts.area()); } + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + //allocate lines if (skipped_res_for_recon == false) { @@ -207,10 +210,19 @@ namespace ojph { allocator->pre_alloc_obj(num_steps + 2); ui32 width = res_rect.siz.w + 1; - for (ui32 i = 0; i < num_steps; ++i) + if (precision <= 32) { + for (ui32 i = 0; i < num_steps; ++i) + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); allocator->pre_alloc_data(width, 1); - allocator->pre_alloc_data(width, 1); - allocator->pre_alloc_data(width, 1); + } + else + { + for (ui32 i = 0; i < num_steps; ++i) + allocator->pre_alloc_data(width, 
1); + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); + } } } @@ -436,6 +448,9 @@ namespace ojph { level_index[i] = level_index[i - 1] + val; cur_precinct_loc = point(0, 0); + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + //allocate lines if (skipped_res_for_recon == false) { @@ -460,11 +475,22 @@ namespace ojph { // initiate storage of line_buf ui32 width = res_rect.siz.w + 1; - for (ui32 i = 0; i < num_steps; ++i) - ssp[i].line->wrap( - allocator->post_alloc_data(width, 1), width, 1); - sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); - aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + if (precision <= 32) + { + for (ui32 i = 0; i < num_steps; ++i) + ssp[i].line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + } + else + { + for (ui32 i = 0; i < num_steps; ++i) + ssp[i].line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + } cur_line = 0; rows_to_produce = res_rect.siz.h; diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp index cf007fc9..8efc8de7 100644 --- a/src/core/codestream/ojph_subband.cpp +++ b/src/core/codestream/ojph_subband.cpp @@ -91,13 +91,18 @@ namespace ojph { allocator->pre_alloc_obj((size_t)num_blocks.area()); for (ui32 i = 0; i < num_blocks.w; ++i) - codeblock::pre_alloc(codestream, nominal); + codeblock::pre_alloc(codestream, comp_num, nominal); //allocate lines allocator->pre_alloc_obj(1); //allocate line_buf ui32 width = band_rect.siz.w + 1; - allocator->pre_alloc_data(width, 1); + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + if 
(precision <= 32) + allocator->pre_alloc_data(width, 1); + else + allocator->pre_alloc_data(width, 1); } ////////////////////////////////////////////////////////////////////////// @@ -192,7 +197,12 @@ namespace ojph { lines = allocator->post_alloc_obj(1); //allocate line_buf ui32 width = band_rect.siz.w + 1; - lines->wrap(allocator->post_alloc_data(width,1),width,1); + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + if (precision <= 32) + lines->wrap(allocator->post_alloc_data(width, 1), width, 1); + else + lines->wrap(allocator->post_alloc_data(width, 1), width, 1); } ////////////////////////////////////////////////////////////////////////// @@ -256,10 +266,11 @@ namespace ojph { if (empty) return; - assert(l->pre_size == lines[0].pre_size && l->size == lines[0].size); - si32* t = lines[0].i32; - lines[0].i32 = l->i32; - l->i32 = t; + assert(l->pre_size == lines[0].pre_size && l->size == lines[0].size && + l->flags == lines[0].flags); + void* p = lines[0].p; + lines[0].p = l->p; + l->p = p; } ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h index 6d8af59f..e1c291a3 100644 --- a/src/core/codestream/ojph_subband.h +++ b/src/core/codestream/ojph_subband.h @@ -94,6 +94,8 @@ namespace ojph { bool exists() { return !empty; } line_buf* pull_line(); + resolution* get_parent() { return parent; } + const resolution* get_parent() const { return parent; } private: bool empty; // true if the subband has no pixels or diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index 99fe75d9..11d7b406 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -231,8 +231,7 @@ namespace ojph { num_lines = 3; lines = allocator->post_alloc_obj(num_lines); for (int i = 0; i < 3; ++i) - lines[i].wrap( - allocator->post_alloc_data(width,0),width,0); + 
lines[i].wrap(allocator->post_alloc_data(width, 0), width, 0); } else { diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder.cpp index 259371b8..aa844d4c 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder.cpp @@ -739,11 +739,11 @@ namespace ojph { * @param [in] stride is the decoded codeblock buffer stride * @param [in] stripe_causal is true for stripe causal mode */ - bool ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data, - ui32 missing_msbs, ui32 num_passes, - ui32 lengths1, ui32 lengths2, - ui32 width, ui32 height, ui32 stride, - bool stripe_causal) + bool ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, + bool stripe_causal) { static bool insufficient_precision = false; static bool modify_code = false; @@ -1612,5 +1612,14 @@ namespace ojph { } return true; } + + bool ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, + ui32 stride, bool stripe_causal) + { + return true; + } } -} +} \ No newline at end of file diff --git a/src/core/coding/ojph_block_decoder.h b/src/core/coding/ojph_block_decoder.h index ab019617..a1970174 100644 --- a/src/core/coding/ojph_block_decoder.h +++ b/src/core/coding/ojph_block_decoder.h @@ -50,7 +50,12 @@ namespace ojph { // generic decoder bool - ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data, + ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + + bool + ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal); diff --git 
a/src/core/coding/ojph_block_encoder.cpp b/src/core/coding/ojph_block_encoder.cpp index 2023ef19..c56f4e60 100644 --- a/src/core/coding/ojph_block_encoder.cpp +++ b/src/core/coding/ojph_block_encoder.cpp @@ -467,11 +467,11 @@ namespace ojph { // // ////////////////////////////////////////////////////////////////////////// - void ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded) + void ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) { assert(num_passes == 1); (void)num_passes; //currently not used @@ -943,5 +943,14 @@ namespace ojph { coded->avail_size -= lengths[0]; } + + void ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) + { + + } } } diff --git a/src/core/coding/ojph_block_encoder.h b/src/core/coding/ojph_block_encoder.h index 43d32d8b..d2782fb9 100644 --- a/src/core/coding/ojph_block_encoder.h +++ b/src/core/coding/ojph_block_encoder.h @@ -52,11 +52,18 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void - ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded); + ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded); + + void + ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + 
ojph::coded_lists *& coded); void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs, diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h index 2f074a42..99897f32 100644 --- a/src/core/common/ojph_mem.h +++ b/src/core/common/ojph_mem.h @@ -170,8 +170,9 @@ namespace ojph { ui32 flags; union { si32* i32; // 32bit integer type, used for lossless compression + si64* i64; // 64bit integer type, used for lossless compression float* f32; // float type, used for lossy compression - void* p; // not type is associated with the pointer + void* p; // no type is associated with the pointer }; }; diff --git a/src/core/others/ojph_mem.cpp b/src/core/others/ojph_mem.cpp index b70d51ec..0bb0b5f7 100644 --- a/src/core/others/ojph_mem.cpp +++ b/src/core/others/ojph_mem.cpp @@ -65,22 +65,42 @@ namespace ojph { f32 = p->post_alloc_data(size, pre_size); } + //////////////////////////////////////////////////////////////////////////// + template<> + void line_buf::finalize_alloc(mem_fixed_allocator *p) + { + assert(p != 0 && size != 0); + i64 = p->post_alloc_data(size, pre_size); + } + //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(si32 *buffer, size_t num_ele, ui32 pre_size) { - i32 = buffer; + this->i32 = buffer; this->size = num_ele; this->pre_size = pre_size; + this->flags = LFT_32BIT | LFT_REVERSIBLE; } //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(float *buffer, size_t num_ele, ui32 pre_size) { - f32 = buffer; + this->f32 = buffer; + this->size = num_ele; + this->pre_size = pre_size; + this->flags = LFT_32BIT; + } + + //////////////////////////////////////////////////////////////////////////// + template<> + void line_buf::wrap(si64 *buffer, size_t num_ele, ui32 pre_size) + { + this->i64 = buffer; this->size = num_ele; this->pre_size = pre_size; + this->flags = LFT_64BIT | LFT_REVERSIBLE; } 
//////////////////////////////////////////////////////////////////////////// diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 6dde7b63..a72cd3d4 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -230,7 +230,7 @@ namespace ojph { else { const si32 *sp = src_line->i32 + src_line_offset; - si64 *dp = (si64*)dst_line->p + dst_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; for (ui32 i = width; i > 0; --i) *dp++ = *sp++ + shift; } @@ -239,7 +239,7 @@ namespace ojph { { assert(src_line->flags | line_buf::LFT_64BIT); assert(dst_line->flags | line_buf::LFT_32BIT); - const si64 *sp = (si64*)src_line->p + src_line_offset; + const si64 *sp = src_line->i64 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; for (ui32 i = width; i > 0; --i) *dp++ = (si32)(*sp++ + shift); @@ -267,7 +267,7 @@ namespace ojph { else { const si32 *sp = src_line->i32 + src_line_offset; - si64 *dp = (si64*)dst_line->p + dst_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; for (ui32 i = width; i > 0; --i) { const si64 v = *sp++; *dp++ = v >= 0 ? 
v : (- v - shift); @@ -278,7 +278,7 @@ namespace ojph { { assert(src_line->flags | line_buf::LFT_64BIT); assert(dst_line->flags | line_buf::LFT_32BIT); - const si64 *sp = (si64*)src_line->p + src_line_offset; + const si64 *sp = src_line->i64 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; for (ui32 i = width; i > 0; --i) { const si64 v = *sp++; @@ -358,7 +358,7 @@ namespace ojph { (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; - si64 *yp = (si64*)y->p, *cbp = (si64*)cb->p, *crp = (si64*)cr->p; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; for (ui32 i = repeat; i > 0; --i) { si64 rr = *rp++, gg = *gp++, bb = *bp++; @@ -408,7 +408,7 @@ namespace ojph { (r->flags & line_buf::LFT_32BIT) && (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); - const si64 *yp = (si64*)y->p, *cbp = (si64*)cb->p, *crp = (si64*)cr->p; + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; for (ui32 i = repeat; i > 0; --i) { diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp index 09891541..6a3aeaa0 100644 --- a/src/core/transform/ojph_transform.cpp +++ b/src/core/transform/ojph_transform.cpp @@ -115,63 +115,63 @@ namespace ojph { #ifndef OJPH_DISABLE_SIMD - #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) - - #ifndef OJPH_DISABLE_SSE - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE) - { - irv_vert_step = sse_irv_vert_step; - irv_vert_times_K = sse_irv_vert_times_K; - irv_horz_ana = sse_irv_horz_ana; - irv_horz_syn = sse_irv_horz_syn; - } - #endif // !OJPH_DISABLE_SSE - - #ifndef OJPH_DISABLE_SSE2 - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) - { - rev_vert_step = sse2_rev_vert_step; - rev_horz_ana = sse2_rev_horz_ana; - rev_horz_syn = sse2_rev_horz_syn; - } - #endif // !OJPH_DISABLE_SSE2 - - #ifndef OJPH_DISABLE_AVX - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX) 
- { - irv_vert_step = avx_irv_vert_step; - irv_vert_times_K = avx_irv_vert_times_K; - irv_horz_ana = avx_irv_horz_ana; - irv_horz_syn = avx_irv_horz_syn; - } - #endif // !OJPH_DISABLE_AVX - - #ifndef OJPH_DISABLE_AVX2 - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) - { - rev_vert_step = avx2_rev_vert_step; - rev_horz_ana = avx2_rev_horz_ana; - rev_horz_syn = avx2_rev_horz_syn; - } - #endif // !OJPH_DISABLE_AVX2 - - #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) - { - rev_vert_step = avx512_rev_vert_step; - rev_horz_ana = avx512_rev_horz_ana; - rev_horz_syn = avx512_rev_horz_syn; - - irv_vert_step = avx512_irv_vert_step; - irv_vert_times_K = avx512_irv_vert_times_K; - irv_horz_ana = avx512_irv_horz_ana; - irv_horz_syn = avx512_irv_horz_syn; - } - #endif // !OJPH_DISABLE_AVX512 + // #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) + + // #ifndef OJPH_DISABLE_SSE + // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE) + // { + // irv_vert_step = sse_irv_vert_step; + // irv_vert_times_K = sse_irv_vert_times_K; + // irv_horz_ana = sse_irv_horz_ana; + // irv_horz_syn = sse_irv_horz_syn; + // } + // #endif // !OJPH_DISABLE_SSE + + // #ifndef OJPH_DISABLE_SSE2 + // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) + // { + // rev_vert_step = sse2_rev_vert_step; + // rev_horz_ana = sse2_rev_horz_ana; + // rev_horz_syn = sse2_rev_horz_syn; + // } + // #endif // !OJPH_DISABLE_SSE2 + + // #ifndef OJPH_DISABLE_AVX + // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX) + // { + // irv_vert_step = avx_irv_vert_step; + // irv_vert_times_K = avx_irv_vert_times_K; + // irv_horz_ana = avx_irv_horz_ana; + // irv_horz_syn = avx_irv_horz_syn; + // } + // #endif // !OJPH_DISABLE_AVX + + // #ifndef OJPH_DISABLE_AVX2 + // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) + // { + // rev_vert_step = avx2_rev_vert_step; + // rev_horz_ana = avx2_rev_horz_ana; + // rev_horz_syn = avx2_rev_horz_syn; + // } 
+ // #endif // !OJPH_DISABLE_AVX2 + + // #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) + // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) + // { + // rev_vert_step = avx512_rev_vert_step; + // rev_horz_ana = avx512_rev_horz_ana; + // rev_horz_syn = avx512_rev_horz_syn; + + // irv_vert_step = avx512_irv_vert_step; + // irv_vert_times_K = avx512_irv_vert_times_K; + // irv_horz_ana = avx512_irv_horz_ana; + // irv_horz_syn = avx512_irv_horz_syn; + // } + // #endif // !OJPH_DISABLE_AVX512 - #elif defined(OJPH_ARCH_ARM) + // #elif defined(OJPH_ARCH_ARM) - #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) + // #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) #endif // !OJPH_DISABLE_SIMD @@ -194,13 +194,13 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) ///////////////////////////////////////////////////////////////////////// - void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, - ui32 repeat, bool synthesis) + void gen_rev_vert_step32(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) { const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; si32* dst = aug->i32; const si32* src1 = sig->i32, * src2 = other->i32; @@ -245,9 +245,83 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, - ui32 width, bool even) + void gen_rev_vert_step64(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + si64* dst = aug->i64; + const si64* src1 = sig->i64, * src2 = other->i64; + // The general 
definition of the wavelet in Part 2 is slightly + // different to Part 1, although they are mathematically equivalent; + // here, we identify the simpler forms from Part 1 and employ them + if (a == 1) + { // 5/3 update and any case with a == 1 + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + *src1++ + *src2++) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + *src1++ + *src2++) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ += (*src1++ + *src2++) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (*src1++ + *src2++) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b - (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b - (*src1++ + *src2++)) >> e; + } + else { // general case + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + gen_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + 
gen_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { @@ -279,7 +353,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; // extension lp[-1] = lp[0]; @@ -321,11 +395,109 @@ namespace ojph { hdst->i32[0] = src->i32[0] << 1; } } - - ////////////////////////////////////////////////////////////////////////// - void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // combine both lsrc and hsrc into dst + si64* dph = hdst->i64; + si64* dpl = ldst->i64; + si64* sp = src->i64; + ui32 w = width; + if (!even) + { + *dph++ = *sp++; --w; + } + for (; w > 1; w -= 2) + { + *dpl++ = *sp++; *dph++ = *sp++; + } + if (w) + { + *dpl++ = *sp++; --w; + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp + (even ? 
1 : 0); + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + (sp[-1] + sp[0])) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp -= (sp[-1] + sp[0]) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b - (sp[-1] + sp[0])) >> e; + } + else { + // general case + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + gen_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + gen_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void gen_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -339,7 +511,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; // extension oth[-1] = oth[0]; @@ 
-400,6 +572,104 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void gen_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth + (ev ? 0 : 1); + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + (sp[-1] + sp[0])) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp += (sp[-1] + sp[0]) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b - (sp[-1] + sp[0])) >> e; + } + else { + // general case + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + si64* sph = hsrc->i64; + si64* spl = lsrc->i64; + si64* dp = dst->i64; + ui32 w = width; + if (!even) + { + *dp++ = *sph++; --w; + } + for (; w > 1; w -= 2) + { + *dp++ = *spl++; *dp++ = *sph++; + } + if (w) + { + *dp++ = *spl++; --w; + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + 
///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + gen_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + gen_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + ////////////////////////////////////////////////////////////////////////// void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, const line_buf* other, const line_buf* aug, From b4e38790206c72c99a78cd48d9be6262f9391b58 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 1 Nov 2024 21:52:55 +1100 Subject: [PATCH 38/78] 64 bit block encoder/decoder done. No SIMD. Bug in Tiles. 
--- src/core/codestream/ojph_codeblock.cpp | 4 +- src/core/codestream/ojph_codeblock_fun.cpp | 2 +- src/core/coding/ojph_block_decoder.cpp | 1091 +++++++++++++++++++- src/core/coding/ojph_block_encoder.cpp | 620 ++++++++++- src/core/common/ojph_arch.h | 55 +- src/core/transform/ojph_transform.cpp | 68 +- 6 files changed, 1757 insertions(+), 83 deletions(-) diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp index 45c91416..53d9a6b1 100644 --- a/src/core/codestream/ojph_codeblock.cpp +++ b/src/core/codestream/ojph_codeblock.cpp @@ -64,8 +64,8 @@ namespace ojph { const param_siz* sz = codestream->get_siz(); const param_cod* cd = codestream->get_cod(comp_num); - ui32 bit_depth = cd->propose_implementation_precision(sz); - if (bit_depth <= 32) + ui32 precision = cd->propose_implementation_precision(sz); + if (precision <= 32) allocator->pre_alloc_data(nominal.h * stride, 0); else allocator->pre_alloc_data(nominal.h * stride, 0); diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index 8cc034a7..4474428f 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -165,7 +165,7 @@ namespace ojph { #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) - // Accelerated functions for INTEL/AMD CPUs + // Accelerated functions for INTEL/AMD CPUs #ifndef OJPH_DISABLE_SSE if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE) mem_clear32 = sse_mem_clear32; diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder.cpp index aa844d4c..f2dd1c08 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder.cpp @@ -423,6 +423,97 @@ namespace ojph { return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp } + //************************************************************************/ + /** @brief Read and unstuff data from a backwardly-growing segment + * + * This reader 
reads 8 bits from the VLC segment. It fills zeros when + * the buffer is exhausted; we basically do not care about these zeros + * because we should not need them -- any extra data should not be used + * in the actual decoding. If these bytes are needed, then there is a + * problem in the bitstream, but we do not flag this error. + * + * Unstuffing is needed to prevent sequences larger than 0xFF8F from + * appearing in the bitstream; since we are reading backward, we keep + * watch when a value larger than 0x8F appears in the bitstream. + * If the byte following this is 0x7F, we unstuff this byte (ignore the + * MSB of that byte, which should be 0). + * + * @param [in] vlcp is a pointer to rev_struct structure + */ + static inline + void rev_read8(rev_struct *vlcp) + { + // process 1 byte + ui8 val = 0; // insert 0s at the end -- the standard says that the + // bitstream must contain all needed bits. Therefore + // if the whole bitstream is consumed and bits are still + // needed, then this is an error condition, but we are + // lenient -- it is also possible that we are decoding + // more bits than we actually need. + if (vlcp->size > 0) // if there is at least 1 byte left in VLC + { + val = *vlcp->data; // then read 8 bits + --vlcp->data; // decrement data pointer (reading backward) + --vlcp->size; // decrement number of bytes in the buffer + } + + // accumulate in tmp, and increment bits, check if unstuffing is needed + ui8 t = (vlcp->unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0; + val = val & (0xFFU >> t); // protect against erroneous 1 in MSB + vlcp->tmp |= (ui64)val << vlcp->bits; + vlcp->bits += 8 - t; + vlcp->unstuff = val > 0x8F; + } + + //************************************************************************/ + /** @brief Initiates the rev_struct structure and reads the first byte + * + * This subroutine initializes the VLC decoder. 
It discards the first + * 12 bits (they have the sum of the lengths of VLC and MEL segments), + * and depending on unstuffing, stores 3 or 4 bits in the unstuffed + * decoded buffer. + * + * @param [in] vlcp is a pointer to rev_struct structure + * @param [in] data is a pointer to byte at the start of the cleanup pass + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void rev_init8(rev_struct *vlcp, ui8* data, int lcup, int scup) + { + //first byte has only the upper 4 bits + vlcp->data = data + lcup - 2; + + //size can not be larger than this, in fact it should be smaller + vlcp->size = scup - 2; + + ui8 val = *vlcp->data--; // read one byte (this is a half byte) + + // the first byte is treated different to other bytes, because only + // the MSB nibble is part of the VLC code. + val = val >> 4; + ui8 t = ((val & 0x7) == 0x7) ? 1 : 0; // unstuffing is needed + val = val & (0xFU >> t); // protect against erroneous 1 in MSB + vlcp->tmp = val; + vlcp->bits = 4 - t; + vlcp->unstuff = val > 0x8; //this is useful for the next byte + } + + //************************************************************************/ + /** @brief Fills the temporary variable (vlcp->tmp) by up to 64 bits + * + * By the end of this call, vlcp->tmp must have no less than 56 bits + * + * @param [in] vlcp is a pointer to rev_struct structure + */ + static inline + ui64 rev_fetch64(rev_struct *vlcp) + { + while (vlcp->bits <= 56) + rev_read8(vlcp); // read 8 bits, but unstuffing might reduce this + return vlcp->tmp; // return unstuff decoded bits + } + //************************************************************************/ /** @brief Consumes num_bits from a rev_struct structure * @@ -438,6 +529,21 @@ namespace ojph { return (ui32)vlcp->tmp; } + //************************************************************************/ + /** @brief Consumes num_bits from a rev_struct structure + * + * @param [in] vlcp 
is a pointer to rev_struct structure
+     *  @param  [in]  num_bits is the number of bits to be removed
+     */
+    static inline
+    ui64 rev_advance64(rev_struct *vlcp, ui32 num_bits)
+    {
+      assert(num_bits <= vlcp->bits); // cannot remove more bits than we hold
+      vlcp->tmp >>= num_bits;         // remove bits
+      vlcp->bits -= num_bits;         // decrement the number of bits
+      return vlcp->tmp;
+    }
+
     //************************************************************************/
     /** @brief Reads and unstuffs from rev_struct
      *
@@ -652,6 +758,41 @@ namespace ojph {
       msp->bits += bits;
     }
 
+    //************************************************************************/
+    /** @brief Reads and unstuffs 8 bits from a forward-growing bitstream
+     *
+     *  A template is used to accommodate a different requirement for
+     *  MagSgn and SPP bitstreams; in particular, when MagSgn bitstream is
+     *  consumed, 0xFF's are fed, while when SPP is exhausted 0's are fed in.
+     *  X controls this value.
+     *
+     *  Unstuffing prevents sequences that are more than 0xFF7F from appearing
+     *  in the compressed sequence. So whenever a value of 0xFF is coded, the
+     *  MSB of the next byte is set 0 and must be ignored during decoding.
+     *
+     *  @tparam       X is the value fed in when the bitstream is exhausted
+     *  @param  [in]  msp is a pointer to frwd_struct structure
+     *
+     */
+    template<int X>
+    static inline
+    void frwd_read8(frwd_struct *msp)
+    {
+      ui8 val = X;
+      if (msp->size > 0) {
+        val = *msp->data;  // read 8 bits
+        ++msp->data;       // increment pointer
+        --msp->size;       // reduce size
+      }
+
+      // unstuff and accumulate
+      ui8 t = msp->unstuff ? 1 : 0;
+      val = val & (0xFFU >> t);
+      msp->unstuff = (val == 0xFF);
+      msp->tmp |= ((ui64)val) << msp->bits; // move data to msp->tmp
+      msp->bits += 8 - t;
+    }
+
     //************************************************************************/
     /** @brief Initialize frwd_struct struct and reads some bytes
      *
@@ -688,6 +829,27 @@ namespace ojph {
       frwd_read<X>(msp); // read 32 bits more
     }
 
+    //************************************************************************/
+    /** @brief Initializes frwd_struct and reads the first 8 bits
+     *
+     *  @tparam       X is the value fed in when the bitstream is exhausted.
+     *                See frwd_read8 regarding the template
+     *  @param  [in]  msp is a pointer to frwd_struct
+     *  @param  [in]  data is a pointer to the start of data
+     *  @param  [in]  size is the number of bytes in the bitstream
+     */
+    template<int X>
+    static inline
+    void frwd_init8(frwd_struct *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      msp->tmp = 0;
+      msp->bits = 0;
+      msp->unstuff = 0;
+      msp->size = size;
+      frwd_read8<X>(msp); // read 8 bits
+    }
+
     //************************************************************************/
     /** @brief Consume num_bits bits from the bitstream of frwd_struct
      *
@@ -722,6 +884,22 @@ namespace ojph {
       return (ui32)msp->tmp;
     }
 
+    //************************************************************************/
+    /** @brief Fetches up to 64 bits from the frwd_struct bitstream
+     *  @tparam       X is the value fed in when the bitstream is exhausted.
+     *                See frwd_read8 regarding the template
+     *  @param  [in]  msp is a pointer to frwd_struct
+     *  @return the content of msp->tmp, holding more than 56 bits
+     */
+    template<int X>
+    static inline
+    ui64 frwd_fetch64(frwd_struct *msp)
+    {
+      while (msp->bits <= 56)
+        frwd_read8<X>(msp);
+      return msp->tmp;
+    }
+
     //************************************************************************/
     /** @brief Decodes one codeblock, processing the cleanup, siginificance
      *         propagation, and magnitude refinement pass
@@ -1217,7 +1395,7 @@ namespace ojph {
 
             ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1?
ui32 emax = vp[0] | vp[1];
-            emax = 31 - count_leading_zeros(emax | 2); // emax - 1
+            emax = 31 - count_leading_zeros(emax | 2); // emax - 1
             ui32 kappa = gamma ? emax : 1;
 
             ui32 U_q = u_q + kappa;
@@ -1613,12 +1791,915 @@ namespace ojph {
       return true;
     }
 
+    //************************************************************************/
+    /** @brief Decodes one codeblock, processing the cleanup, significance
+     *         propagation, and magnitude refinement pass
+     *
+     *  @param [in]   coded_data is a pointer to bitstream
+     *  @param [in]   decoded_data is a pointer to decoded codeblock data buf.
+     *  @param [in]   missing_msbs is the number of missing MSBs
+     *  @param [in]   num_passes is the number of passes: 1 if CUP only,
+     *                2 for CUP+SPP, and 3 for CUP+SPP+MRP
+     *  @param [in]   lengths1 is the length of cleanup pass
+     *  @param [in]   lengths2 is the length of the SPP (or SPP+MRP) passes
+     *  @param [in]   width is the decoded codeblock width
+     *  @param [in]   height is the decoded codeblock height
+     *  @param [in]   stride is the decoded codeblock buffer stride
+     *  @param [in]   stripe_causal is true for stripe causal mode
+     *  @return true if decoding succeeds, false if the codeblock is rejected
+     */
     bool ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data,
-                                 ui32 missing_msbs, ui32 num_passes,
-                                 ui32 lengths1, ui32 lengths2,
-                                 ui32 width, ui32 height,
-                                 ui32 stride, bool stripe_causal)
+                                 ui32 missing_msbs, ui32 num_passes,
+                                 ui32 lengths1, ui32 lengths2,
+                                 ui32 width, ui32 height, ui32 stride,
+                                 bool stripe_causal)
     {
+      static bool insufficient_precision = false;
+      static bool modify_code = false;
+      static bool truncate_spp_mrp = false;
+
+      if (num_passes > 1 && lengths2 == 0)
+      {
+        OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
+                              "one coding pass, but zero length for "
+                              "2nd and potential 3rd pass.");
+        num_passes = 1;
+      }
+
+      if (num_passes > 3)
+      {
+        OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
+                              "This codeblocks has %d passes.",
+                              num_passes);
+        return false;
+      }
+
+      if (missing_msbs > 62) // p < 0
+      {
+        if (insufficient_precision == false)
+        {
+          insufficient_precision = true;
+          OJPH_WARN(0x00010003, "64 bits are not enough to decode this "
+                                "codeblock. This message will not be "
+                                "displayed again.");
+        }
+        return false;
+      }
+      else if (missing_msbs == 62) // p == 0
+      { // not enough precision to decode and set the bin center to 1
+        if (modify_code == false) {
+          modify_code = true;
+          OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
+                                "pass. The code can be modified to support "
+                                "this case. This message will not be "
+                                "displayed again.");
+        }
+        return false;         // 64 bits are not enough to decode this
+      }
+      else if (missing_msbs == 61) // if p is 1, then num_passes must be 1
+      {
+        if (num_passes > 1) {
+          num_passes = 1;
+          if (truncate_spp_mrp == false) {
+            truncate_spp_mrp = true;
+            OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
+                                  "nor MagRef passes; both will be skipped. "
+                                  "This message will not be displayed "
+                                  "again.");
+          }
+        }
+      }
+      ui32 p = 62 - missing_msbs; // The least significant bitplane for CUP
+      // There is a way to handle the case of p == 0, but a different path
+      // is required
+
+      if (lengths1 < 2)
+      {
+        OJPH_WARN(0x00010006, "Wrong codeblock length.");
+        return false;
+      }
+
+      // read scup and fix the bytes there
+      int lcup, scup;
+      lcup = (int)lengths1;  // length of CUP
+      //scup is the length of MEL + VLC
+      scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
+      if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
+        return false;
+
+      // The temporary storage scratch holds two types of data in an
+      // interleaved fashion. The interleaving allows us to use one
+      // memory pointer.
+      // We have one entry for a decoded VLC code, and one entry for UVLC.
+      // Entries are 16 bits each, corresponding to one quad,
+      // but since we want to use XMM registers of the SSE family
+      // of SIMD; we allocated 16 bytes or more per quad row; that is,
+      // the width is no smaller than 16 bytes (or 8 entries), and the
+      // height is 512 quads
+      // Each VLC entry contains, in the following order, starting
+      // from MSB
+      // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
+      // Each entry in UVLC contains u_q
+      // One extra row to handle the case of SPP propagating downwards
+      // when codeblock width is 4
+      ui16 scratch[8 * 513] = {0};       // 8 kB
+
+      // We need an extra two entries (one inf and one u_q) beyond
+      // the last column.
+      // If the block width is 4 (2 quads), then we use sstr of 8
+      // (enough for 4 quads). If width is 8 (4 quads) we use
+      // sstr is 16 (enough for 8 quads). For a width of 16 (8
+      // quads), we use 24 (enough for 12 quads).
+      ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
+
+      ui32 mmsbp2 = missing_msbs + 2;
+
+      // The cleanup pass is decoded in two steps; in step one,
+      // the VLC and MEL segments are decoded, generating a record that
+      // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
+      // This information should be sufficient for the next step.
+      // In step 2, we decode the MagSgn segment.
+
+      // step 1 decoding VLC and MEL segments
+      {
+        // init structures
+        dec_mel_st mel;
+        mel_init(&mel, coded_data, lcup, scup);
+        rev_struct vlc;
+        rev_init8(&vlc, coded_data, lcup, scup);
+
+        int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
+                                     // data represented as runs of 0 events
+                                     // See mel_decode description
+
+        ui64 vlc_val;
+        ui32 c_q = 0;
+        ui16 *sp = scratch;
+        //initial quad row
+        for (ui32 x = 0; x < width; sp += 4)
+        {
+          // decode VLC
+          /////////////
+
+          // first quad
+          vlc_val = rev_fetch64(&vlc);
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0) //zero context
+          {
+            run -= 2; //subtract 2, since events number is multiplied by 2
+
+            // Is the run terminated in 1? if so, use decoded VLC code,
+            // otherwise, discard decoded data, since we will decode again
+            // using a different context
+            t0 = (run == -1) ? t0 : 0;
+
+            // is run -1 or -2? this means a run has been consumed
+            if (run < 0)
+              run = mel_get_run(&mel);  // get another run
+          }
+          //run -= (c_q == 0) ? 2 : 0;
+          //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+          sp[0] = t0;
+          x += 2;
+
+          // prepare context for the next quad; eqn. 1 in ITU T.814
+          c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
+
+          //remove data from vlc stream (0 bits are removed if vlc is not used)
+          vlc_val = rev_advance64(&vlc, t0 & 0x7);
+
+          //second quad
+          ui16 t1 = 0;
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0 && x < width) //zero context
+          {
+            run -= 2; //subtract 2, since events number is multiplied by 2
+
+            // if event is 0, discard decoded t1
+            t1 = (run == -1) ? t1 : 0;
+
+            if (run < 0) // have we consumed all events in a run
+              run = mel_get_run(&mel); // if yes, then get another run
+          }
+          t1 = x < width ? t1 : 0;
+          //run -= (c_q == 0 && x < width) ? 2 : 0;
+          //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+          sp[2] = t1;
+          x += 2;
+
+          //prepare context for the next quad, eqn. 1 in ITU T.814
+          c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
+
+          //remove data from vlc stream, if qinf is not used, cwdlen is 0
+          vlc_val = rev_advance64(&vlc, t1 & 0x7);
+
+          // decode u
+          /////////////
+          // uvlc_mode is made up of u_offset bits from the quad pair
+          ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+          if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
+          {                     // the MEL run of events
+            run -= 2; //subtract 2, since events number is multiplied by 2
+
+            uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
+                                                 // 0x40 if the event is 1
+
+            if (run < 0)//if run is consumed (run is -1 or -2), get another run
+              run = mel_get_run(&mel);
+          }
+          //run -= (uvlc_mode == 0xc0) ? 2 : 0;
+          //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+
+          //decode uvlc_mode to get u for both quads
+          ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
+          //remove total prefix length
+          vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+          uvlc_entry >>= 3;
+          //extract suffixes for quad 0 and 1
+          ui32 len = uvlc_entry & 0xF;           //suffix length for 2 quads
+          ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
+          vlc_val = rev_advance64(&vlc, len);
+          uvlc_entry >>= 4;
+          // quad 0 length
+          len = uvlc_entry & 0x7; // quad 0 suffix length
+          uvlc_entry >>= 3;
+          ui16 u_q0 = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kappa == 1
+          ui16 u_q1 = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len));  //kappa == 1
+          // decode u_q extensions, which are needed only when u_q > 32
+          // NOTE(review): u_q0/u_q1 include kappa == 1 here; confirm the test
+          ui32 u_ext; bool cond0, cond1;
+          cond0 = u_q0 > 32;
+          u_ext = cond0 ? (ui32)(vlc_val & 0xF) : 0; // 4 bits from the stream
+          vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+          u_q0 = (ui16)(u_q0 + (u_ext << 2));
+          sp[1] = u_q0;
+          cond1 = u_q1 > 32;
+          u_ext = cond1 ? (ui32)(vlc_val & 0xF) : 0; // 4 bits from the stream
+          vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+          u_q1 = (ui16)(u_q1 + (u_ext << 2));
+          sp[3] = u_q1;
+        }
+        sp[0] = sp[1] = 0;
+
+        //non initial quad rows
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          c_q = 0;                                // context
+          ui16 *sp = scratch + (y >> 1) * sstr;   // this row of quads
+
+          for (ui32 x = 0; x < width; sp += 4)
+          {
+            // decode VLC
+            /////////////
+
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
+
+            // first quad
+            vlc_val = rev_fetch64(&vlc);
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) //zero context
+            {
+              run -= 2; //subtract 2, since events number is multiplied by 2
+
+              // Is the run terminated in 1? if so, use decoded VLC code,
+              // otherwise, discard decoded data, since we will decode again
+              // using a different context
+              t0 = (run == -1) ? t0 : 0;
+
+              // is run -1 or -2? this means a run has been consumed
+              if (run < 0)
+                run = mel_get_run(&mel);  // get another run
+            }
+            //run -= (c_q == 0) ? 2 : 0;
+            //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel);  // get another run
+            sp[0] = t0;
+            x += 2;
+
+            // prepare context for the next quad; eqn. 2 in ITU T.814
+            // sigma_q (w, sw)
+            c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[0 - (si32)sstr] & 0x80;
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
+
+            //remove data from vlc stream (0 bits are removed if vlc is unused)
+            vlc_val = rev_advance64(&vlc, t0 & 0x7);
+
+            //second quad
+            ui16 t1 = 0;
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0 && x < width) //zero context
+            {
+              run -= 2; //subtract 2, since events number is multiplied by 2
+
+              // if event is 0, discard decoded t1
+              t1 = (run == -1) ? t1 : 0;
+
+              if (run < 0) // have we consumed all events in a run
+                run = mel_get_run(&mel); // if yes, then get another run
+            }
+            t1 = x < width ? t1 : 0;
+            //run -= (c_q == 0 && x < width) ? 2 : 0;
+            //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel);  // get another run
+            sp[2] = t1;
+            x += 2;
+
+            // partial c_q, will be completed when we process the next quad
+            // sigma_q (w, sw)
+            c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[2 - (si32)sstr] & 0x80;
+
+            //remove data from vlc stream, if qinf is not used, cwdlen is 0
+            vlc_val = rev_advance64(&vlc, t1 & 0x7);
+
+            // decode u
+            /////////////
+            // uvlc_mode is made up of u_offset bits from the quad pair
+            ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+            ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
+            //remove total prefix length
+            vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+            uvlc_entry >>= 3;
+            //extract suffixes for quad 0 and 1
+            ui32 len = uvlc_entry & 0xF;           //suffix length for 2 quads
+            ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
+            vlc_val = rev_advance64(&vlc, len);
+            uvlc_entry >>= 4;
+            // quad 0 length
+            len = uvlc_entry & 0x7; // quad 0 suffix length
+            uvlc_entry >>= 3;
+            ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+            ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
+
+            // decode u_q extensions, which are needed only when u_q > 32
+            ui32 u_ext; bool cond0, cond1;
+            cond0 = u_q0 > 32;
+            u_ext = cond0 ? (ui32)(vlc_val & 0xF) : 0; // 4 bits from the stream
+            vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+            u_q0 = (ui16)(u_q0 + (u_ext << 2));
+            sp[1] = u_q0;
+            cond1 = u_q1 > 32;
+            u_ext = cond1 ? (ui32)(vlc_val & 0xF) : 0; // 4 bits from the stream
+            vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+            u_q1 = (ui16)(u_q1 + (u_ext << 2));
+            sp[3] = u_q1;
+          }
+          sp[0] = sp[1] = 0;
+        }
+      }
+
+      // step2 we decode magsgn
+      {
+        // We allocate a scratch row for storing v_n values.
+        // We have 512 quads horizontally.
+        // We need an extra entry to handle the case of vp[1]
+        // when vp is at the last column.
+        // Here, we allocate 4 instead of 1 to make the buffer size
+        // a multiple of 16 bytes.
+        const int v_n_size = 512 + 4;
+        ui64 v_n_scratch[v_n_size] = {0};  // 4+ kB
+
+        frwd_struct magsgn;
+        frwd_init8<0xFF>(&magsgn, coded_data, lcup - scup);
+
+        const ui16 *sp = scratch;
+        ui64 *vp = v_n_scratch;
+        ui64 *dp = decoded_data;
+
+        ui64 prev_v_n = 0;
+        for (ui32 x = 0; x < width; sp += 2, ++vp)
+        {
+          ui32 inf = sp[0];
+          ui32 U_q = sp[1];
+          if (U_q > mmsbp2)
+            return false;
+
+          ui64 v_n;
+          ui64 val = 0;
+          ui32 bit = 0;
+          if (inf & (1 << (4 + bit)))
+          {
+            //fetch up to 64 bits of magsgn data
+            ui64 ms_val = frwd_fetch64<0xFF>(&magsgn);
+            ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+            frwd_advance(&magsgn, m_n);         //consume m_n
+
+            val = ms_val << 63;                 // get sign bit
+            v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits
+            v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
+            v_n |= 1;                           // add center of bin
+            //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+            //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+            val |= (v_n + 2) << (p - 1);
+          }
+          dp[0] = val;
+
+          v_n = 0;
+          val = 0;
+          bit = 1;
+          if (inf & (1 << (4 + bit)))
+          {
+            //fetch up to 64 bits of magsgn data
+            ui64 ms_val = frwd_fetch64<0xFF>(&magsgn);
+            ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+            frwd_advance(&magsgn, m_n);         //consume m_n
+
+            val = ms_val << 63;                 // get sign bit
+            v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits
+            v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
+            v_n |= 1;                           // add center of bin
+            //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+            //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+            val |= (v_n + 2) << (p - 1);
+          }
+          dp[stride] = val;
+          vp[0] = prev_v_n | v_n;
+          prev_v_n = 0;
+          ++dp;
+          if (++x >= width)
+          { ++vp; break; }
+
+          val = 0;
+          bit = 2;
+          if (inf & (1 << (4 + bit)))
+          {
+            //fetch up to 64 bits of magsgn data
+            ui64 ms_val = frwd_fetch64<0xFF>(&magsgn);
+            ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+            frwd_advance(&magsgn, m_n);         //consume m_n
+
+            val = ms_val << 63;                 // get sign bit
+            v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits
+            v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
+            v_n |= 1;                           // add center of bin
+            //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+            //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+            val |= (v_n + 2) << (p - 1);
+          }
+          dp[0] = val;
+
+          v_n = 0;
+          val = 0;
+          bit = 3;
+          if (inf & (1 << (4 + bit)))
+          {
+            //fetch up to 64 bits of magsgn data
+            ui64 ms_val = frwd_fetch64<0xFF>(&magsgn);
+            ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+            frwd_advance(&magsgn, m_n);         //consume m_n
+
+            val = ms_val << 63;                 // get sign bit
+            v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits
+            v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
+            v_n |= 1;                           // add center of bin
+            //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+            //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+            val |= (v_n + 2) << (p - 1);
+          }
+          dp[stride] = val;
+          prev_v_n = v_n;
+          ++dp;
+          ++x;
+        }
+        vp[0] = prev_v_n;
+
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          const ui16 *sp = scratch + (y >> 1) * sstr;
+          ui64 *vp = v_n_scratch;
+          ui64 *dp = decoded_data + y * stride;
+
+          prev_v_n = 0;
+          for (ui32 x = 0; x < width; sp += 2, ++vp)
+          {
+            ui32 inf = sp[0];
+            ui32 u_q = sp[1];
+
+            ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1?
+            ui32 emax = 63 - count_leading_zeros(2 | vp[0] | vp[1]); // emax-1
+            ui32 kappa = gamma ? emax : 1;
+
+            ui32 U_q = u_q + kappa;
+            if (U_q > mmsbp2)
+              return false;
+
+            ui64 v_n;
+            ui64 val = 0;
+            ui32 bit = 0;
+            if (inf & (1 << (4 + bit)))
+            {
+              //fetch up to 64 bits of magsgn data
+              ui64 ms_val = frwd_fetch64<0xFF>(&magsgn);
+              ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+              frwd_advance(&magsgn, m_n);         //consume m_n
+
+              val = ms_val << 63;                 // get sign bit
+              v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits
+              v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB
+              v_n |= 1;                           // add center of bin
+              //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+              //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+              val |= (v_n + 2) << (p - 1);
+            }
+            dp[0] = val;
+
+            v_n = 0;
+            val = 0;
+            bit = 1;
+            if (inf & (1 << (4 + bit)))
+            {
+              //fetch up to 64 bits of magsgn data
+              ui64 ms_val = frwd_fetch64<0xFF>(&magsgn);
+              ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+              frwd_advance(&magsgn, m_n);         //consume m_n
+
+              val = ms_val << 63;                 // get sign bit
+              v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits
+              v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB
+              v_n |= 1;                           // add center of bin
+              //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+              //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+              val |= (v_n + 2) << (p - 1);
+            }
+            dp[stride] = val;
+            vp[0] = prev_v_n | v_n;
+            prev_v_n = 0;
+            ++dp;
+            if (++x >= width)
+            { ++vp; break; }
+
+            val = 0;
+            bit = 2;
+            if (inf & (1 << (4 + bit)))
+            {
+              //fetch up to 64 bits of magsgn data
+              ui64 ms_val = frwd_fetch64<0xFF>(&magsgn);
+              ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+              frwd_advance(&magsgn, m_n);         //consume m_n
+
+              val = ms_val << 63;                 // get sign bit
+              v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits
+              v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB
+              v_n |= 1;                           // add center of bin
+              //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+              //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+              val |= (v_n + 2) << (p - 1);
+            }
+            dp[0] = val;
+
+            v_n = 0;
+            val = 0;
+            bit = 3;
+            if (inf & (1 << (4 + bit)))
+            {
+              //fetch up to 64 bits of magsgn data
+              ui64 ms_val = frwd_fetch64<0xFF>(&magsgn);
+              ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k
+              frwd_advance(&magsgn, m_n);         //consume m_n
+
+              val = ms_val << 63;                 // get sign bit
+              v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits
+              v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB
+              v_n |= 1;                           // add center of bin
+              //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
+              //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
+              val |= (v_n + 2) << (p - 1);
+            }
+            dp[stride] = val;
+            prev_v_n = v_n;
+            ++dp;
+            ++x;
+          }
+          vp[0] = prev_v_n;
+        }
+      }
+
+      if (num_passes > 1)
+      {
+        // We use scratch again, we can divide it into multiple regions
+        // sigma holds all the significant samples, and it cannot
+        // be modified after it is set. It will be used during the
+        // Magnitude Refinement Pass
+        ui16* const sigma = scratch;
+
+        ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
+                                         // ui16 contains 4 columns
+        mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
+
+        // We re-arrange quad significance, where each 4 consecutive
+        // bits represent one quad, into column significance, where,
+        // each 4 consecutive bits represent one column of 4 rows
+        {
+          ui32 y;
+          for (y = 0; y < height; y += 4)
+          {
+            ui16* sp = scratch + (y >> 1) * sstr;
+            ui16* dp = sigma + (y >> 2) * mstr;
+            for (ui32 x = 0; x < width; x += 4, sp += 4, ++dp) {
+              ui32 t0 = 0, t1 = 0;
+              t0  = ((sp[0     ] & 0x30u) >> 4) | ((sp[0     ] & 0xC0u) >> 2);
+              t0 |= ((sp[2     ] & 0x30u) << 4) | ((sp[2     ] & 0xC0u) << 6);
+              t1  = ((sp[0+sstr] & 0x30u) >> 2) | ((sp[0+sstr] & 0xC0u)     );
+              t1 |= ((sp[2+sstr] & 0x30u) << 6) | ((sp[2+sstr] & 0xC0u) << 8);
+              dp[0] = (ui16)(t0 | t1);
+            }
+            dp[0] = 0; // set an extra entry on the right with 0
+          }
+          {
+            // reset one row after the codeblock
+            ui16* dp = sigma + (y >> 2) * mstr;
+            for (ui32 x = 0; x < width; x += 4, ++dp)
+              dp[0] = 0;
+            dp[0] = 0; // set an extra entry on the right with 0
+          }
+        }
+
+        // We perform Significance Propagation Pass here
+        {
+          // This stores significance information of the previous
+          // 4 rows. Significance information in this array includes
+          // all significant samples in bitplane p - 1; that is,
+          // significant samples for bitplane p (discovered during the
+          // cleanup pass and stored in sigma) and samples that have recently
+          // become significant (during the SPP) in bitplane p-1.
+          // We store enough for the widest row, containing 1024 columns,
+          // which is equivalent to 256 of ui16, since each stores 4 columns.
+          // We add an extra 8 entries, just in case we need more
+          ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
+
+          frwd_struct sigprop;
+          frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
+
+          for (ui32 y = 0; y < height; y += 4)
+          {
+            ui32 pattern = 0xFFFFu; // a pattern needed samples
+            if (height - y < 4) {
+              pattern = 0x7777u;
+              if (height - y < 3) {
+                pattern = 0x3333u;
+                if (height - y < 2)
+                  pattern = 0x1111u;
+              }
+            }
+
+            // prev holds sign. info. for the previous quad, together
+            // with the rows on top of it and below it.
+            ui32 prev = 0;
+            ui16 *prev_sig = prev_row_sig;
+            ui16 *cur_sig = sigma + (y >> 2) * mstr;
+            ui64 *dpp = decoded_data + y * stride;
+            for (ui32 x = 0; x < width; x += 4, ++cur_sig, ++prev_sig)
+            {
+              // only rows and columns inside the stripe are included
+              si32 s = (si32)x + 4 - (si32)width;
+              s = ojph_max(s, 0);
+              pattern = pattern >> (s * 4);
+
+              // We first find locations that need to be tested (potential
+              // SPP members); these location will end up in mbr
+              // In each iteration, we produce 16 bits because cwd can have
+              // up to 16 bits of significance information, followed by the
+              // corresponding 16 bits of sign information; therefore, it is
+              // sufficient to fetch 32 bit data per loop.
+
+              // Although we are interested in 16 bits only, we load 32 bits.
+              // For the 16 bits we are producing, we need the next 4 bits --
+              // We need data for at least 5 columns out of 8.
+              // Therefore loading 32 bits is easier than loading 16 bits
+              // twice.
+              ui32 ps = *(ui32*)prev_sig;
+              ui32 ns = *(ui32*)(cur_sig + mstr);
+              ui32 u = (ps & 0x88888888) >> 3; // the row on top
+              if (!stripe_causal)
+                u |= (ns & 0x11111111) << 3;   // the row below
+
+              ui32 cs = *(ui32*)cur_sig;
+              // vertical integration
+              ui32 mbr = cs;                   // this sig. info.
+              mbr |= (cs & 0x77777777) << 1;   //above neighbors
+              mbr |= (cs & 0xEEEEEEEE) >> 1;   //below neighbors
+              mbr |= u;
+              // horizontal integration
+              ui32 t = mbr;
+              mbr |= t << 4;      // neighbors on the left
+              mbr |= t >> 4;      // neighbors on the right
+              mbr |= prev >> 12;  // significance of previous group
+
+              // remove outside samples, and already significant samples
+              mbr &= pattern;
+              mbr &= ~cs;
+
+              // find samples that become significant during the SPP
+              ui32 new_sig = mbr;
+              if (new_sig)
+              {
+                ui64 cwd = frwd_fetch<0>(&sigprop);
+
+                ui32 cnt = 0;
+                ui32 col_mask = 0xFu;
+                ui32 inv_sig = ~cs & pattern;
+                for (int i = 0; i < 16; i += 4, col_mask <<= 4)
+                {
+                  if ((col_mask & new_sig) == 0)
+                    continue;
+
+                  //scan one column
+                  ui32 sample_mask = 0x1111u & col_mask;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0x33u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0x76u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0xECu << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0xC8u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+                }
+
+                if (new_sig)
+                {
+                  // new_sig has newly-discovered sig. samples during SPP
+                  // find the signs and update decoded_data
+                  ui64 *dp = dpp + x;
+                  ui64 val = 3ULL << (p - 2); // 64-bit shift; p - 2 can be >= 32
+                  col_mask = 0xFu;
+                  for (int i = 0; i < 4; ++i, ++dp, col_mask <<= 4)
+                  {
+                    if ((col_mask & new_sig) == 0)
+                      continue;
+
+                    //scan 4 signs
+                    ui32 sample_mask = 0x1111u & col_mask;
+                    if (new_sig & sample_mask)
+                    {
+                      assert(dp[0] == 0);
+                      dp[0] = (cwd << 63) | val;
+                      cwd >>= 1; ++cnt;
+                    }
+
+                    sample_mask += sample_mask;
+                    if (new_sig & sample_mask)
+                    {
+                      assert(dp[stride] == 0);
+                      dp[stride] = (cwd << 63) | val;
+                      cwd >>= 1; ++cnt;
+                    }
+
+                    sample_mask += sample_mask;
+                    if (new_sig & sample_mask)
+                    {
+                      assert(dp[2 * stride] == 0);
+                      dp[2 * stride] = (cwd << 63) | val;
+                      cwd >>= 1; ++cnt;
+                    }
+
+                    sample_mask += sample_mask;
+                    if (new_sig & sample_mask)
+                    {
+                      assert(dp[3 * stride] == 0);
+                      dp[3 * stride] = (cwd << 63) | val;
+                      cwd >>= 1; ++cnt;
+                    }
+                  }
+                }
+                frwd_advance(&sigprop, cnt);
+              }
+
+              new_sig |= cs;
+              *prev_sig = (ui16)(new_sig);
+
+              // vertical integration for the new sig. info.
+              t = new_sig;
+              new_sig |= (t & 0x7777) << 1; //above neighbors
+              new_sig |= (t & 0xEEEE) >> 1; //below neighbors
+              // add sig. info. from the row on top and below
+              prev = new_sig | u;
+              // we need only the bits in 0xF000
+              prev &= 0xF000;
+            }
+          }
+        }
+
+        // We perform Magnitude Refinement Pass here
+        if (num_passes > 2)
+        {
+          rev_struct magref;
+          rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
+
+          for (ui32 y = 0; y < height; y += 4)
+          {
+            ui32 *cur_sig = (ui32*)(sigma + (y >> 2) * mstr);
+            ui64 *dpp = decoded_data + y * stride;
+            ui64 half = 1ULL << (p - 2);
+            for (ui32 i = 0; i < width; i += 8)
+            {
+              //Process one entry from sigma array at a time
+              // Each nibble (4 bits) in the sigma array represents 4 rows,
+              // and the 32 bits contain 8 columns
+              ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
+              ui32 sig = *cur_sig++; // 32 bit that will be processed now
+              ui32 col_mask = 0xFu;  // a mask for a column in sig
+              if (sig) // if any of the 32 bits are set
+              {
+                for (int j = 0; j < 8; ++j) //one column at a time
+                {
+                  if (sig & col_mask) // lowest nibble
+                  {
+                    ui64 *dp = dpp + i + j; // next column in decoded samples
+                    ui32 sample_mask = 0x11111111u & col_mask; //LSB
+
+                    for (int k = 0; k < 4; ++k) {
+                      if (sig & sample_mask) //if LSB is set
+                      {
+                        assert(dp[0] != 0); // decoded value cannot be zero
+                        assert((dp[0] & half) == 0); // no half
+                        ui64 sym = cwd & 1; // get its value
+                        sym = (1 - sym) << (p - 1); // previous center of bin
+                        sym |= half; // put half the center of bin
+                        dp[0] ^= sym; // remove old bin center and put new
+                        cwd >>= 1; // consume word
+                      }
+                      sample_mask += sample_mask; //next row
+                      dp += stride; // next samples row
+                    }
+                  }
+                  col_mask <<= 4; //next column
+                }
+              }
+              // consume data according to the number of bits set
+              rev_advance_mrp(&magref, population_count(sig));
+            }
+          }
+        }
+      }
       return true;
     }
   }
diff --git a/src/core/coding/ojph_block_encoder.cpp b/src/core/coding/ojph_block_encoder.cpp
index c56f4e60..87e8810d 100644
--- a/src/core/coding/ojph_block_encoder.cpp
+++ b/src/core/coding/ojph_block_encoder.cpp
@@ -65,11 +65,12 @@ namespace ojph {
     static
ui16 vlc_tbl1[2048] = { 0 }; //UVLC encoding - static int ulvc_cwd_pre[33]; - static int ulvc_cwd_pre_len[33]; - static int ulvc_cwd_suf[33]; - static int ulvc_cwd_suf_len[33]; - + const int num_uvlc_entries = 74; + struct uvlc_tbl_struct { + ui8 pre, pre_len, suf, suf_len, ext, ext_len; + }; + static uvlc_tbl_struct uvlc_tbl[num_uvlc_entries]; + ///////////////////////////////////////////////////////////////////////// static bool vlc_init_tables() { @@ -194,23 +195,61 @@ namespace ojph { static bool uvlc_init_tables() { //code goes from 0 to 31, extension and 32 are not supported here - ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2; - ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4; - ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1; - ulvc_cwd_pre_len[2] = 2; - ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3; - ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0; - ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1; - ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0; - ulvc_cwd_suf_len[2] = 0; - ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1; + uvlc_tbl[0].pre = 0; + uvlc_tbl[0].pre_len = 0; + uvlc_tbl[0].suf = 0; + uvlc_tbl[0].suf_len = 0; + uvlc_tbl[0].ext = 0; + uvlc_tbl[0].ext_len = 0; + + uvlc_tbl[1].pre = 1; + uvlc_tbl[1].pre_len = 1; + uvlc_tbl[1].suf = 0; + uvlc_tbl[1].suf_len = 0; + uvlc_tbl[1].ext = 0; + uvlc_tbl[1].ext_len = 0; + + uvlc_tbl[2].pre = 2; + uvlc_tbl[2].pre_len = 2; + uvlc_tbl[2].suf = 0; + uvlc_tbl[2].suf_len = 0; + uvlc_tbl[2].ext = 0; + uvlc_tbl[2].ext_len = 0; + + uvlc_tbl[3].pre = 4; + uvlc_tbl[3].pre_len = 3; + uvlc_tbl[3].suf = 0; + uvlc_tbl[3].suf_len = 1; + uvlc_tbl[3].ext = 0; + uvlc_tbl[3].ext_len = 0; + + uvlc_tbl[4].pre = 4; + uvlc_tbl[4].pre_len = 3; + uvlc_tbl[4].suf = 1; + uvlc_tbl[4].suf_len = 1; + uvlc_tbl[4].ext = 0; + uvlc_tbl[4].ext_len = 0; + for (int i = 5; i < 33; ++i) { - ulvc_cwd_pre[i] = 0; - ulvc_cwd_pre_len[i] = 3; - ulvc_cwd_suf[i] = i-5; - ulvc_cwd_suf_len[i] = 5; + uvlc_tbl[i].pre = 0; + 
uvlc_tbl[i].pre_len = 3; + uvlc_tbl[i].suf = i - 5; + uvlc_tbl[i].suf_len = 5; + uvlc_tbl[i].ext = 0; + uvlc_tbl[i].ext_len = 0; + } + + for (int i = 33; i < 75; ++i) + { + uvlc_tbl[i].pre = 0; + uvlc_tbl[i].pre_len = 3; + uvlc_tbl[i].suf = 28 + (i - 33) % 4; + uvlc_tbl[i].suf_len = 5; + uvlc_tbl[i].ext = (i - 33) / 4; + uvlc_tbl[i].ext_len = 4; } + return true; } @@ -440,6 +479,29 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_encode64(ms_struct* msp, ui64 cwd, int cwd_len) + { + while (cwd_len > 0) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full"); + int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len); + msp->tmp |= (cwd & ((1ULL << t) - 1)) << msp->used_bits; + msp->used_bits += t; + cwd >>= t; + cwd_len -= t; + if (msp->used_bits >= msp->max_bits) + { + msp->buf[msp->pos++] = (ui8)msp->tmp; + msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8; + msp->tmp = 0; + msp->used_bits = 0; + } + } + } + ////////////////////////////////////////////////////////////////////////// static inline void ms_terminate(ms_struct* msp) @@ -693,23 +755,23 @@ namespace ojph { if (u_q0 > 2 && u_q1 > 2) { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0-2], ulvc_cwd_pre_len[u_q0-2]); - vlc_encode(&vlc, ulvc_cwd_pre[u_q1-2], ulvc_cwd_pre_len[u_q1-2]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0-2], ulvc_cwd_suf_len[u_q0-2]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q1-2], ulvc_cwd_suf_len[u_q1-2]); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len); } else if (u_q0 > 2 && u_q1 > 0) { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]); + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); vlc_encode(&vlc, u_q1 - 1, 1); - vlc_encode(&vlc, 
ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); } else { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]); - vlc_encode(&vlc, ulvc_cwd_pre[u_q1], ulvc_cwd_pre_len[u_q1]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q1], ulvc_cwd_suf_len[u_q1]); + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); } //prepare for next iteration @@ -910,10 +972,10 @@ namespace ojph { ms_encode(&ms, s[7] & ((1U<> 1) | ((rho[1] & 8) >> 2); @@ -944,13 +1006,495 @@ namespace ojph { coded->avail_size -= lengths[0]; } + ////////////////////////////////////////////////////////////////////////// + // + // + // + // + // + ////////////////////////////////////////////////////////////////////////// void ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded) + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) { - + assert(num_passes == 1); + (void)num_passes; //currently not used + const int ms_size = (16384*16+14)/15; //more than enough + ui8 ms_buf[ms_size]; + const int mel_vlc_size = 3072; //more than enough + ui8 mel_vlc_buf[mel_vlc_size]; + const int mel_size = 192; + ui8 *mel_buf = mel_vlc_buf; + const int vlc_size = mel_vlc_size - mel_size; + ui8 *vlc_buf = mel_vlc_buf + mel_size; + + mel_struct mel; + mel_init(&mel, mel_size, mel_buf); + vlc_struct vlc; + vlc_init(&vlc, vlc_size, vlc_buf); + ms_struct ms; + ms_init(&ms, ms_size, ms_buf); + + ui32 p = 62 - missing_msbs; + + //e_val: E values for a line (these are the highest set bit) + //cx_val: is 
the context values + //Each byte stores the info for the 2 sample. For E, it is maximum + // of the two samples, while for cx, it is the OR of these two samples. + //The maximum is between the pixel at the bottom left of one quad + // and the bottom right of the earlier quad. The same is true for cx. + //For a 1024 pixels, we need 512 bytes, the 2 extra, + // one for the non-existing earlier quad, and one for beyond the + // the end + ui8 e_val[513]; + ui8 cx_val[513]; + ui8* lep = e_val; lep[0] = 0; + ui8* lcxp = cx_val; lcxp[0] = 0; + + //initial row of quads + int e_qmax[2] = {0,0}, e_q[8] = {0,0,0,0,0,0,0,0}; + int rho[2] = {0,0}; + int c_q0 = 0; + ui64 s[8] = {0,0,0,0,0,0,0,0}, val, t; + ui32 y = 0; + ui64 *sp = buf; + for (ui32 x = 0; x < width; x += 4) + { + //prepare two quads + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL; // 2 \mu_p + if (val) + { + rho[0] = 1; + e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = e_q[0]; + s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 2; + e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[1]); + s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 1 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 4; + e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[2]); + s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? 
sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 8; + e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[3]); + s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int Uq0 = ojph_max(e_qmax[0], 1); //kappa_q = 1 + int u_q0 = Uq0 - 1, u_q1 = 0; //kappa_q = 1 + + int eps0 = 0; + if (u_q0 > 0) + { + eps0 |= (e_q[0] == e_qmax[0]); + eps0 |= (e_q[1] == e_qmax[0]) << 1; + eps0 |= (e_q[2] == e_qmax[0]) << 2; + eps0 |= (e_q[3] == e_qmax[0]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + lep[0] = (ui8)e_q[3]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + lcxp[0] = (ui8)((rho[0] & 8) >> 3); + + ui16 tuple0 = vlc_tbl0[(c_q0 << 8) + (rho[0] << 4) + eps0]; + vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7); + + if (c_q0 == 0) + mel_encode(&mel, rho[0] != 0); + + int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0; + ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m); + m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0; + ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m); + m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0; + ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m); + m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0; + ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m); + + if (x + 2 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] = 1; + e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = e_q[4]; + s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? 
sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 2; + e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[5]); + s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 3 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 4; + e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[6]); + s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 8; + e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[7]); + s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int c_q1 = (rho[0] >> 1) | (rho[0] & 1); + int Uq1 = ojph_max(e_qmax[1], 1); //kappa_q = 1 + u_q1 = Uq1 - 1; //kappa_q = 1 + + int eps1 = 0; + if (u_q1 > 0) + { + eps1 |= (e_q[4] == e_qmax[1]); + eps1 |= (e_q[5] == e_qmax[1]) << 1; + eps1 |= (e_q[6] == e_qmax[1]) << 2; + eps1 |= (e_q[7] == e_qmax[1]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++; + lep[0] = (ui8)e_q[7]; + lcxp[0] |= (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++; + lcxp[0] = (ui8)((rho[1] & 8) >> 3); + ui16 tuple1 = vlc_tbl0[(c_q1 << 8) + (rho[1] << 4) + eps1]; + vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7); + + if (c_q1 == 0) + mel_encode(&mel, rho[1] != 0); + + int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0; + ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m); + m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0; + ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m); + m = (rho[1] & 4) ? Uq1 - ((tuple1 & 4) >> 2) : 0; + ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m); + m = (rho[1] & 8) ? 
Uq1 - ((tuple1 & 8) >> 3) : 0; + ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m); + } + + if (u_q0 > 0 && u_q1 > 0) + mel_encode(&mel, ojph_min(u_q0, u_q1) > 2); + + if (u_q0 > 2 && u_q1 > 2) + { + vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].ext, uvlc_tbl[u_q0-2].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].ext, uvlc_tbl[u_q1-2].ext_len); + } + else if (u_q0 > 2 && u_q1 > 0) + { + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, u_q1 - 1, 1); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + } + else + { + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len); + } + + //prepare for next iteration + c_q0 = (rho[1] >> 1) | (rho[1] & 1); + s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0; + e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0; + rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0; + } + + lep[1] = 0; + + for (y = 2; y < height; y += 2) + { + lep = e_val; + int max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = 0; + lcxp = cx_val; + c_q0 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = 0; + + sp = buf + y * stride; + for (ui32 x = 0; x < width; x += 4) + { + //prepare two quads + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] = 1; + e_q[0] = 64 - 
(int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = e_q[0]; + s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 2; + e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[1]); + s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 1 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 4; + e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[2]); + s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 8; + e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[3]); + s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int kappa = (rho[0] & (rho[0]-1)) ? ojph_max(1,max_e) : 1; + int Uq0 = ojph_max(e_qmax[0], kappa); + int u_q0 = Uq0 - kappa, u_q1 = 0; + + int eps0 = 0; + if (u_q0 > 0) + { + eps0 |= (e_q[0] == e_qmax[0]); + eps0 |= (e_q[1] == e_qmax[0]) << 1; + eps0 |= (e_q[2] == e_qmax[0]) << 2; + eps0 |= (e_q[3] == e_qmax[0]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = (ui8)e_q[3]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + int c_q1 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = (ui8)((rho[0] & 8) >> 3); + ui16 tuple0 = vlc_tbl1[(c_q0 << 8) + (rho[0] << 4) + eps0]; + vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7); + + if (c_q0 == 0) + mel_encode(&mel, rho[0] != 0); + + int m = (rho[0] & 1) ? 
Uq0 - (tuple0 & 1) : 0; + ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m); + m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0; + ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m); + m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0; + ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m); + m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0; + ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m); + + if (x + 2 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] = 1; + e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = e_q[4]; + s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 2; + e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[5]); + s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 3 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 4; + e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[6]); + s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 8; + e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[7]); + s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + kappa = (rho[1] & (rho[1]-1)) ? 
ojph_max(1,max_e) : 1; + c_q1 |= ((rho[0] & 4) >> 1) | ((rho[0] & 8) >> 2); + int Uq1 = ojph_max(e_qmax[1], kappa); + u_q1 = Uq1 - kappa; + + int eps1 = 0; + if (u_q1 > 0) + { + eps1 |= (e_q[4] == e_qmax[1]); + eps1 |= (e_q[5] == e_qmax[1]) << 1; + eps1 |= (e_q[6] == e_qmax[1]) << 2; + eps1 |= (e_q[7] == e_qmax[1]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++; + max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = (ui8)e_q[7]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++; + c_q0 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = (ui8)((rho[1] & 8) >> 3); + ui16 tuple1 = vlc_tbl1[(c_q1 << 8) + (rho[1] << 4) + eps1]; + vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7); + + if (c_q1 == 0) + mel_encode(&mel, rho[1] != 0); + + int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0; + ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m); + m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0; + ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m); + m = (rho[1] & 4) ? Uq1 - ((tuple1 & 4) >> 2) : 0; + ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m); + m = (rho[1] & 8) ? 
Uq1 - ((tuple1 & 8) >> 3) : 0; + ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m); + } + + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len); + + //prepare for next iteration + c_q0 |= ((rho[1] & 4) >> 1) | ((rho[1] & 8) >> 2); + s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0; + e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0; + rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0; + } + } + + + terminate_mel_vlc(&mel, &vlc); + ms_terminate(&ms); + + //copy to elastic + lengths[0] = mel.pos + vlc.pos + ms.pos; + elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded); + memcpy(coded->buf, ms.buf, ms.pos); + memcpy(coded->buf + ms.pos, mel.buf, mel.pos); + memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos); + + // put in the interface locator word + ui32 num_bytes = mel.pos + vlc.pos; + coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4); + coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0; + coded->buf[lengths[0]-2] = + (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF)); + + coded->avail_size -= lengths[0]; } } } diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h index 947f25b0..15566820 100644 --- a/src/core/common/ojph_arch.h +++ b/src/core/common/ojph_arch.h @@ -166,6 +166,32 @@ namespace ojph { #endif } + ///////////////////////////////////////////////////////////////////////////// + static inline ui32 population_count64(ui64 val) + { + #if defined(OJPH_COMPILER_MSVC) \ + && (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) + return (ui32)__popcnt64(val); + #elif (defined OJPH_COMPILER_GNUC) + return (ui32)__builtin_popcountll(val); + #else + const ui64 k1 = 
0x5555555555555555ull; + const ui64 k2 = 0x3333333333333333ull; + const ui64 k4 = 0x0F0F0F0F0F0F0F0Full; + const ui64 kf = 0x0101010101010101ull; + + // put count of each 2 bits into those 2 bits + val = val - ((val >> 1) & k1); + // put count of each 4 bits into those 4 bits + val = (val & k2) + ((val >> 2) & k2); + // put count of each 8 bits into those 8 bits + val = (val + (val >> 4)) & k4 ; + // returns 8 most significant bits of x + (x<<8) + (x<<16) + (x<<24) + ... + val = (val * kf) >> 56; + return (ui32) val; + #endif + } + ///////////////////////////////////////////////////////////////////////////// #ifdef OJPH_COMPILER_MSVC #pragma intrinsic(_BitScanReverse) @@ -188,6 +214,29 @@ namespace ojph { #endif } + ///////////////////////////////////////////////////////////////////////////// +#ifdef OJPH_COMPILER_MSVC + #pragma intrinsic(_BitScanReverse64) +#endif + static inline ui32 count_leading_zeros(ui64 val) + { + #ifdef OJPH_COMPILER_MSVC + unsigned long result = 0; + _BitScanReverse64(&result, val); + return 63 ^ (ui32)result; + #elif (defined OJPH_COMPILER_GNUC) + return (ui32)__builtin_clzll(val); + #else + val |= (val >> 1); + val |= (val >> 2); + val |= (val >> 4); + val |= (val >> 8); + val |= (val >> 16); + val |= (val >> 32); + return 64 - population_count64(val); + #endif + } + ///////////////////////////////////////////////////////////////////////////// #ifdef OJPH_COMPILER_MSVC #pragma intrinsic(_BitScanForward) @@ -247,17 +296,17 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // finds the size such that it is a multiple of byte_alignment - template + template size_t calc_aligned_size(size_t size) { size = size * sizeof(T) + N - 1; size &= ~((1ULL << (31 - count_leading_zeros(N))) - 1); - size >>= (31 - count_leading_zeros(sizeof(T))); + size >>= (63 - count_leading_zeros(sizeof(T))); return size; } //////////////////////////////////////////////////////////////////////////// // moves the pointer 
to first address that is a multiple of byte_alignment - template + template inline T *align_ptr(T *ptr) { intptr_t p = reinterpret_cast(ptr); p += N - 1; diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp index 6a3aeaa0..32189e56 100644 --- a/src/core/transform/ojph_transform.cpp +++ b/src/core/transform/ojph_transform.cpp @@ -115,17 +115,17 @@ namespace ojph { #ifndef OJPH_DISABLE_SIMD - // #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) + #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) - // #ifndef OJPH_DISABLE_SSE - // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE) - // { - // irv_vert_step = sse_irv_vert_step; - // irv_vert_times_K = sse_irv_vert_times_K; - // irv_horz_ana = sse_irv_horz_ana; - // irv_horz_syn = sse_irv_horz_syn; - // } - // #endif // !OJPH_DISABLE_SSE + #ifndef OJPH_DISABLE_SSE + if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE) + { + irv_vert_step = sse_irv_vert_step; + irv_vert_times_K = sse_irv_vert_times_K; + irv_horz_ana = sse_irv_horz_ana; + irv_horz_syn = sse_irv_horz_syn; + } + #endif // !OJPH_DISABLE_SSE // #ifndef OJPH_DISABLE_SSE2 // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) @@ -136,15 +136,15 @@ namespace ojph { // } // #endif // !OJPH_DISABLE_SSE2 - // #ifndef OJPH_DISABLE_AVX - // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX) - // { - // irv_vert_step = avx_irv_vert_step; - // irv_vert_times_K = avx_irv_vert_times_K; - // irv_horz_ana = avx_irv_horz_ana; - // irv_horz_syn = avx_irv_horz_syn; - // } - // #endif // !OJPH_DISABLE_AVX + #ifndef OJPH_DISABLE_AVX + if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX) + { + irv_vert_step = avx_irv_vert_step; + irv_vert_times_K = avx_irv_vert_times_K; + irv_horz_ana = avx_irv_horz_ana; + irv_horz_syn = avx_irv_horz_syn; + } + #endif // !OJPH_DISABLE_AVX // #ifndef OJPH_DISABLE_AVX2 // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) @@ -155,23 +155,23 @@ namespace ojph { // } // #endif // !OJPH_DISABLE_AVX2 - // 
#if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) - // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) - // { - // rev_vert_step = avx512_rev_vert_step; - // rev_horz_ana = avx512_rev_horz_ana; - // rev_horz_syn = avx512_rev_horz_syn; - - // irv_vert_step = avx512_irv_vert_step; - // irv_vert_times_K = avx512_irv_vert_times_K; - // irv_horz_ana = avx512_irv_horz_ana; - // irv_horz_syn = avx512_irv_horz_syn; - // } - // #endif // !OJPH_DISABLE_AVX512 + #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) + if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) + { + // rev_vert_step = avx512_rev_vert_step; + // rev_horz_ana = avx512_rev_horz_ana; + // rev_horz_syn = avx512_rev_horz_syn; + + irv_vert_step = avx512_irv_vert_step; + irv_vert_times_K = avx512_irv_vert_times_K; + irv_horz_ana = avx512_irv_horz_ana; + irv_horz_syn = avx512_irv_horz_syn; + } + #endif // !OJPH_DISABLE_AVX512 - // #elif defined(OJPH_ARCH_ARM) + #elif defined(OJPH_ARCH_ARM) - // #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) + #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) #endif // !OJPH_DISABLE_SIMD From 163f4269fe81fa80718554ae102c6cf7e6299578 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 1 Nov 2024 21:53:56 +1100 Subject: [PATCH 39/78] Version bump. 
--- src/core/common/ojph_version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h index 2f3adcc6..00faf755 100644 --- a/src/core/common/ojph_version.h +++ b/src/core/common/ojph_version.h @@ -34,5 +34,5 @@ //***************************************************************************/ #define OPENJPH_VERSION_MAJOR 0 -#define OPENJPH_VERSION_MINOR 17 +#define OPENJPH_VERSION_MINOR 18 #define OPENJPH_VERSION_PATCH 0 From c927c04ab21f322a69df1a14e629688364465317 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 1 Nov 2024 22:14:41 +1100 Subject: [PATCH 40/78] Changed fail-fast to false --- .github/workflows/ccp-workflow.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index da94fd23..449c1b78 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -10,6 +10,7 @@ on: jobs: build: strategy: + fail-fast: false matrix: include: [ { system: MacOS, runner: macos-latest }, @@ -29,6 +30,7 @@ jobs: build_windows: strategy: + fail-fast: false matrix: include: [ { system: Windows, runner: windows-latest }, @@ -47,6 +49,7 @@ jobs: test: strategy: matrix: + fail-fast: false include: [ { system: MacOS, runner: macos-latest }, { system: Ubuntu-latest, runner: ubuntu-latest }, @@ -67,6 +70,7 @@ jobs: test_windows: strategy: + fail-fast: false matrix: include: [ { system: Windows, runner: windows-latest }, From 358160743d23ed4513c3b4bcf40fdc5b7d1b15d6 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 1 Nov 2024 22:16:19 +1100 Subject: [PATCH 41/78] ojph_compress for pfm does not need bit_depth --- src/apps/ojph_compress/ojph_compress.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp index e7c047a4..9dbdefb1 100644 --- a/src/apps/ojph_compress/ojph_compress.cpp +++ 
b/src/apps/ojph_compress/ojph_compress.cpp @@ -768,10 +768,6 @@ int main(int argc, char * argv[]) { assert(num_comps == 1 || num_comps == 3); siz.set_num_components(num_comps); - if (bit_depth[0] == 0) - OJPH_ERROR(0x01000091, - "-bit_depth must be specified (this is temporary only).\n"); - if (bit_depth[0] != 0) // one was set if (num_bit_depths < num_comps) // but if not enough, repeat for (ojph::ui32 c = num_bit_depths; c < num_comps; ++c) From 3992c83c6e70b9d3e10a2473a10a5c3a036de17d Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 2 Nov 2024 07:15:35 +1100 Subject: [PATCH 42/78] update to workflow --- .github/workflows/ccp-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 449c1b78..3d170b54 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -48,8 +48,8 @@ jobs: test: strategy: + fail-fast: false matrix: - fail-fast: false include: [ { system: MacOS, runner: macos-latest }, { system: Ubuntu-latest, runner: ubuntu-latest }, From 075ee13d9883b5a11abe63584eb1d8a608cf617a Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 2 Nov 2024 07:32:48 +1100 Subject: [PATCH 43/78] Avoids warnings, and a bug fix -- macos not compiling. --- src/core/coding/ojph_block_decoder.cpp | 34 +++++++++++++------------- src/core/coding/ojph_block_encoder.cpp | 12 ++++----- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder.cpp index f2dd1c08..c5d4d610 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder.cpp @@ -459,7 +459,7 @@ namespace ojph { // accumulate in tmp, and increment bits, check if unstuffing is needed ui8 t = (vlcp->unstuff && ((val & 0x7F) == 0x7F)) ? 
1 : 0; - val = val & (0xFFU >> t); // protect against erroneous 1 in MSB + val = (ui8)(val & (0xFFU >> t)); // protect against erroneous 1 in MSB vlcp->tmp |= (ui64)val << vlcp->bits; vlcp->bits += 8 - t; vlcp->unstuff = val > 0x8F; @@ -491,9 +491,9 @@ namespace ojph { // the first byte is treated different to other bytes, because only // the MSB nibble is part of the VLC code. - val = val >> 4; + val = (ui8)(val >> 4); ui8 t = ((val & 0x7) == 0x7) ? 1 : 0; // unstuffing is needed - val = val & (0xFU >> t); // protect against erroneous 1 in MSB + val = (ui8)(val & (0xFU >> t)); // protect against erroneous 1 in MSB vlcp->tmp = val; vlcp->bits = 4 - t; vlcp->unstuff = val > 0x8; //this is useful for the next byte @@ -787,7 +787,7 @@ namespace ojph { // unstuff and accumulate ui8 t = msp->unstuff ? 1 : 0; - val = val & (0xFFU >> t); + val = (ui8)(val & (0xFFU >> t)); msp->unstuff = (val == 0xFF); msp->tmp |= ((ui64)val) << msp->bits; // move data to msp->tmp msp->bits += 8 - t; @@ -2030,27 +2030,27 @@ namespace ojph { vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7); uvlc_entry >>= 3; //extract suffixes for quad 0 and 1 - ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads - ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads + ui32 len = uvlc_entry & 0xF; // suffix length for 2 quads + ui32 tmp = (ui32)(vlc_val&((1<>= 4; // quad 0 length len = uvlc_entry & 0x7; // quad 0 suffix length uvlc_entry >>= 3; ui16 u_q0 = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<> 3) + (tmp >> len)); //kappa == 1 + ui16 u_q1 = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa==1 // decode u_q extensions, which is needed only when u_q > 32 - ui32 u_ext; bool cond0, cond1; + ui16 u_ext; bool cond0, cond1; cond0 = u_q0 > 32; - u_ext = cond0 ? (uvlc_entry & 0xF) : 0; + u_ext = (ui16)(cond0 ? (uvlc_entry & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0); u_q0 += u_ext << 2; sp[1] = u_q0; cond1 = u_q1 > 32; - u_ext = cond1 ? 
(uvlc_entry & 0xF) : 0; + u_ext = (ui16)(cond1 ? (uvlc_entry & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0); - u_q0 += u_ext << 2; + u_q1 += u_ext << 2; sp[3] = u_q1; } sp[0] = sp[1] = 0; @@ -2152,8 +2152,8 @@ namespace ojph { vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7); uvlc_entry >>= 3; //extract suffixes for quad 0 and 1 - ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads - ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads + ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads + ui32 tmp = (ui32)(vlc_val&((1<>= 4; // quad 0 length @@ -2163,16 +2163,16 @@ namespace ojph { ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q // decode u_q extensions, which is needed only when u_q > 32 - ui32 u_ext; bool cond0, cond1; + ui16 u_ext; bool cond0, cond1; cond0 = u_q0 > 32; - u_ext = cond0 ? (uvlc_entry & 0xF) : 0; + u_ext = (ui16)(cond0 ? (uvlc_entry & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0); u_q0 += u_ext << 2; sp[1] = u_q0; cond1 = u_q1 > 32; - u_ext = cond1 ? (uvlc_entry & 0xF) : 0; + u_ext = (ui16)(cond1 ? (uvlc_entry & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond1 ? 
4 : 0); - u_q0 += u_ext << 2; + u_q1 += u_ext << 2; sp[3] = u_q1; } sp[0] = sp[1] = 0; diff --git a/src/core/coding/ojph_block_encoder.cpp b/src/core/coding/ojph_block_encoder.cpp index 87e8810d..7758bd79 100644 --- a/src/core/coding/ojph_block_encoder.cpp +++ b/src/core/coding/ojph_block_encoder.cpp @@ -65,7 +65,7 @@ namespace ojph { static ui16 vlc_tbl1[2048] = { 0 }; //UVLC encoding - const int num_uvlc_entries = 74; + const int num_uvlc_entries = 75; struct uvlc_tbl_struct { ui8 pre, pre_len, suf, suf_len, ext, ext_len; }; @@ -234,19 +234,19 @@ namespace ojph { { uvlc_tbl[i].pre = 0; uvlc_tbl[i].pre_len = 3; - uvlc_tbl[i].suf = i - 5; + uvlc_tbl[i].suf = (ui8)(i - 5); uvlc_tbl[i].suf_len = 5; uvlc_tbl[i].ext = 0; uvlc_tbl[i].ext_len = 0; } - for (int i = 33; i < 75; ++i) + for (int i = 33; i < num_uvlc_entries; ++i) { uvlc_tbl[i].pre = 0; uvlc_tbl[i].pre_len = 3; - uvlc_tbl[i].suf = 28 + (i - 33) % 4; + uvlc_tbl[i].suf = (ui8)(28 + (i - 33) % 4); uvlc_tbl[i].suf_len = 5; - uvlc_tbl[i].ext = (i - 33) / 4; + uvlc_tbl[i].ext = (ui8)((i - 33) / 4); uvlc_tbl[i].ext_len = 4; } @@ -488,7 +488,7 @@ namespace ojph { if (msp->pos >= msp->buf_size) OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full"); int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len); - msp->tmp |= (cwd & ((1ULL << t) - 1)) << msp->used_bits; + msp->tmp |= (ui32)((cwd & ((1ULL << t) - 1)) << msp->used_bits); msp->used_bits += t; cwd >>= t; cwd_len -= t; From abd993dbc304784750d8670593750e9f317ff9aa Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 2 Nov 2024 07:40:01 +1100 Subject: [PATCH 44/78] This should compile on macos --- src/core/common/ojph_arch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h index 15566820..8292a686 100644 --- a/src/core/common/ojph_arch.h +++ b/src/core/common/ojph_arch.h @@ -300,7 +300,7 @@ namespace ojph { size_t calc_aligned_size(size_t size) { size = size * sizeof(T) + N - 
1; size &= ~((1ULL << (31 - count_leading_zeros(N))) - 1); - size >>= (63 - count_leading_zeros(sizeof(T))); + size >>= (63 - count_leading_zeros((ui64)sizeof(T))); return size; } From 630217f485c8bb9b410de2e55f4a9ff6393ab80a Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 2 Nov 2024 21:09:13 +1100 Subject: [PATCH 45/78] Bug fixes. There are other bugs for 32 lossless. --- src/core/codestream/ojph_bitbuffer_write.h | 32 ++++++++-------------- src/core/codestream/ojph_params.cpp | 19 +++++++------ src/core/codestream/ojph_params_local.h | 3 +- src/core/codestream/ojph_precinct.cpp | 4 ++- src/core/coding/ojph_block_encoder.cpp | 15 +++++++++- 5 files changed, 42 insertions(+), 31 deletions(-) diff --git a/src/core/codestream/ojph_bitbuffer_write.h b/src/core/codestream/ojph_bitbuffer_write.h index d5b6bcac..ecb9dd20 100644 --- a/src/core/codestream/ojph_bitbuffer_write.h +++ b/src/core/codestream/ojph_bitbuffer_write.h @@ -109,33 +109,25 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline + void bb_put_zeros(bit_write_buf *bbp, int num_zeros, + mem_elastic_allocator *elastic, + coded_lists*& cur_coded_list, ui32& ph_bytes) + { + for (int i = num_zeros; i > 0; --i) + bb_put_bit(bbp, 0, elastic, cur_coded_list, ph_bytes); + } + ////////////////////////////////////////////////////////////////////////// static inline void bb_put_bits(bit_write_buf *bbp, ui32 data, int num_bits, mem_elastic_allocator *elastic, coded_lists*& cur_coded_list, ui32& ph_bytes) { -// assert(num_bits <= 32); - for (int i = num_bits - 1; i >= 0; --i) + assert(num_bits <= 32); + for (int i = num_bits - 1; i >= 0; --i) bb_put_bit(bbp, data >> i, elastic, cur_coded_list, ph_bytes); -// while (num_bits) { -// int tx_bits = num_bits < bbp->avail_bits ? 
num_bits : bbp->avail_bits; -// bbp->tmp |= (data >> (num_bits - tx_bits)) & ((1 << tx_bits) - 1); -// bbp->avail_bits -= tx_bits; -// if (bbp->avail_bits <= 0) -// { -// bbp->avail_bits = 8 - (bbp->tmp != 0xFF ? 0 : 1); -// bbp->buf[bbp->buf_size - bbp->avail_size] = (ui8)(bbp->tmp & 0xFF); -// bbp->tmp = 0; -// --bbp->avail_size; -// if (bbp->avail_size == 0) -// { -// bb_expand_buf(bbp, elastic, cur_coded_list->next_list); -// cur_coded_list = cur_coded_list->next_list; -// ph_bytes += bit_buffer::needed; -// } -// } -// } } ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index 59839996..6895d1be 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -952,16 +952,16 @@ namespace ojph { int s = 0; double bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); ui32 X = (ui32) ceil(log(bibo_l * bibo_l) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + u8_SPqcd[s++] = (ui8)((B + X) << reversible_SPqcd_shift); for (ui32 d = num_decomps; d > 0; --d) { double bibo_l = bibo_gains::get_bibo_gain_l(d, true); double bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); X = (ui32) ceil(log(bibo_h * bibo_l) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + u8_SPqcd[s++] = (ui8)((B + X) << reversible_SPqcd_shift); + u8_SPqcd[s++] = (ui8)((B + X) << reversible_SPqcd_shift); X = (ui32) ceil(log(bibo_h * bibo_h) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + u8_SPqcd[s++] = (ui8)((B + X) << reversible_SPqcd_shift); } } @@ -1017,8 +1017,11 @@ namespace ojph { ui32 B = 0; int irrev = Sqcd & 0x1F; if (irrev == 0) //reversible - for (ui32 i = 0; i < num_subbands; ++i) - B = ojph_max(B, (u8_SPqcd[i] >> 3) + get_num_guard_bits() - 1u); + for (ui32 i = 0; i < num_subbands; ++i) { + ui32 t = (u8_SPqcd[i] >> reversible_SPqcd_shift); + t += get_num_guard_bits() - 1u; + B = ojph_max(B, t); + } 
else if (irrev == 2) //scalar expounded for (ui32 i = 0; i < num_subbands; ++i) { @@ -1088,9 +1091,9 @@ namespace ojph { } int irrev = Sqcd & 0x1F; - if (irrev == 0) //reversible; this is (10.22) from the J2K book + if (irrev == 0) // reversible; this is (10.22) from the J2K book { - num_bits += u8_SPqcd[idx] >> 3; + num_bits += u8_SPqcd[idx] >> reversible_SPqcd_shift; num_bits = num_bits == 0 ? 0 : num_bits - 1; } else if (irrev == 1) diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 8dc1b59c..0ebdd537 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -590,7 +590,7 @@ namespace ojph { { friend ::ojph::param_qcd; public: - param_qcd() + param_qcd() : reversible_SPqcd_shift(3) { Lqcd = 0; Sqcd = 0; @@ -646,6 +646,7 @@ namespace ojph { protected: ui16 Lqcd; ui8 Sqcd; + const ui8 reversible_SPqcd_shift; union { ui8 u8_SPqcd[97]; diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp index 813e33b8..803790d6 100644 --- a/src/core/codestream/ojph_precinct.cpp +++ b/src/core/codestream/ojph_precinct.cpp @@ -221,7 +221,9 @@ namespace ojph { { int num_zeros = *mmsb_tag.get(x>>levm1, y>>levm1, levm1); num_zeros -= *mmsb_tag.get(x>>cur_lev, y>>cur_lev, cur_lev); - bb_put_bits(&bb, 1, num_zeros + 1, + bb_put_zeros(&bb, num_zeros, + elastic, cur_coded_list, ph_bytes); + bb_put_bits(&bb, 1, 1, elastic, cur_coded_list, ph_bytes); *mmsb_tag_flags.get(x>>levm1, y>>levm1, levm1) = 1; } diff --git a/src/core/coding/ojph_block_encoder.cpp b/src/core/coding/ojph_block_encoder.cpp index 7758bd79..ffc9e8df 100644 --- a/src/core/coding/ojph_block_encoder.cpp +++ b/src/core/coding/ojph_block_encoder.cpp @@ -1021,8 +1021,21 @@ namespace ojph { { assert(num_passes == 1); (void)num_passes; //currently not used - const int ms_size = (16384*16+14)/15; //more than enough + // 38 bits/sample + 1 color + 4 wavelet = 43 bits per sample. 
+ // * 4096 samples / 8 bits per byte = 22016; then rounded up to the + // nearest 1 kB, givin 22528. This expanded further to take into + // consideration stuffing at a max rate of 16 bits per 15 bits + // (1 bit for every 15 bits of data); in reality, it is much smaller + // than this. + const int ms_size = (22528 * 16 + 14) / 15; //more than enough ui8 ms_buf[ms_size]; + // For each quad, we need at most, 7 bits for VLC and 12 bits for UVLC. + // So we have 1024 quads * 19 / 8, which is 2432. This must be + // multiplied by 16 / 15 to accommodate stuffing. + // The mel is at most around 1 bit/quad, giving around 128 byte -- in + // practice there was on case where it got to 132 bytes. Even + // accounting for stuffing, it is smaller than 192. Therefore, + // 3072 is more than enough const int mel_vlc_size = 3072; //more than enough ui8 mel_vlc_buf[mel_vlc_size]; const int mel_size = 192; From 2b6fb153f1bf9c074fe33015c2cbdbc387b67751 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 2 Nov 2024 21:14:40 +1100 Subject: [PATCH 46/78] Reduce warnings. --- src/core/coding/ojph_block_decoder.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder.cpp index c5d4d610..cfa86974 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder.cpp @@ -1814,9 +1814,9 @@ namespace ojph { ui32 width, ui32 height, ui32 stride, bool stripe_causal) { - static bool insufficient_precision = false; - static bool modify_code = false; - static bool truncate_spp_mrp = false; + // static bool insufficient_precision = false; + // static bool modify_code = false; + // static bool truncate_spp_mrp = false; if (num_passes > 1 && lengths2 == 0) { @@ -2045,12 +2045,12 @@ namespace ojph { cond0 = u_q0 > 32; u_ext = (ui16)(cond0 ? (uvlc_entry & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond0 ? 
4 : 0); - u_q0 += u_ext << 2; + u_q0 = (ui16)(u_q0 + (u_ext << 2)); sp[1] = u_q0; cond1 = u_q1 > 32; u_ext = (ui16)(cond1 ? (uvlc_entry & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0); - u_q1 += u_ext << 2; + u_q1 = (ui16)(u_q1 + (u_ext << 2)); sp[3] = u_q1; } sp[0] = sp[1] = 0; @@ -2167,12 +2167,12 @@ namespace ojph { cond0 = u_q0 > 32; u_ext = (ui16)(cond0 ? (uvlc_entry & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0); - u_q0 += u_ext << 2; + u_q0 = (ui16)(u_q0 + (u_ext << 2)); sp[1] = u_q0; cond1 = u_q1 > 32; u_ext = (ui16)(cond1 ? (uvlc_entry & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0); - u_q1 += u_ext << 2; + u_q1 = (ui16)(u_q1 + (u_ext << 2)); sp[3] = u_q1; } sp[0] = sp[1] = 0; From 8e1935ffffb3c76f43ed290b6509b96a56b1442d Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 3 Nov 2024 12:44:53 +1100 Subject: [PATCH 47/78] Bug fixes. 32 bit loss should be working now. MacOS may still have a problem. --- src/core/coding/ojph_block_common.cpp | 22 +++++++++++++++++++--- src/core/coding/ojph_block_common.h | 2 +- src/core/coding/ojph_block_decoder.cpp | 24 +++++++++++++----------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/core/coding/ojph_block_common.cpp b/src/core/coding/ojph_block_common.cpp index e6b4de6a..2ba138a9 100644 --- a/src/core/coding/ojph_block_common.cpp +++ b/src/core/coding/ojph_block_common.cpp @@ -84,11 +84,20 @@ namespace ojph { * + 4 * mel event for initial row of quads when needed \n * \n * Each entry contains, starting from the LSB \n - * \li \c total prefix length for quads 0 and 1 (3 bits) \n - * \li \c total suffix length for quads 0 and 1 (4 bits) \n + * \li \c total total prefix length for quads 0 and 1 (3 bits) \n + * \li \c total total suffix length for quads 0 and 1 (4 bits) \n * \li \c suffix length for quad 0 (3 bits) \n * \li \c prefix for quad 0 (3 bits) \n * \li \c prefix for quad 1 (3 bits) \n + * \n + * Another table is uvlc_bias, which is needed to correctly decode 
the + * extension u_ext for initial row of quads. Under certain condition, + * we deduct 1 or 2 from u_q0 and u_q1 before encoding them; so for us + * to know that decoding u_ext is needed, we recreate the u_q0 and u_q1 + * that we actually encoded. \n + * For simplicity, we use the same index as before \n + * \li \c u_q0 bias is 2 bits \n + * \li \c u_q1 bias is 2 bits \n */ /// @brief uvlc_tbl0 contains decoding information for initial row of quads @@ -96,6 +105,8 @@ namespace ojph { /// @brief uvlc_tbl1 contains decoding information for non-initial row of /// quads ui16 uvlc_tbl1[256] = { 0 }; + /// @brief uvlc_bias contains decoding info. for initial row of quads + ui8 uvlc_bias[256+64] = { 0 }; /// @} //************************************************************************/ @@ -199,8 +210,10 @@ namespace ojph { ui32 mode = i >> 6; ui32 vlc = i & 0x3F; - if (mode == 0) // both u_off are 0 + if (mode == 0) { // both u_off are 0 uvlc_tbl0[i] = 0; + uvlc_bias[i] = 0; + } else if (mode <= 2) // u_off are either 01 or 10 { ui32 d = dec[vlc & 0x7]; //look at the least significant 3 bits @@ -232,6 +245,7 @@ namespace ojph { total_suffix = u0_suffix_len; u0 = d0 >> 5; u1 = (vlc & 1) + 1; + uvlc_bias[i] = 4; // 0b00 for u0 and 0b01 for u1 } else { @@ -240,6 +254,7 @@ namespace ojph { total_suffix = u0_suffix_len + ((d1 >> 2) & 0x7); u0 = d0 >> 5; u1 = d1 >> 5; + uvlc_bias[i] = 0; } uvlc_tbl0[i] = (ui16)(total_prefix | @@ -265,6 +280,7 @@ namespace ojph { (u0_suffix_len << 7) | (u0 << 10) | (u1 << 13)); + uvlc_bias[i] = 10; // 0b10 for u0 and 0b10 for u1 } } diff --git a/src/core/coding/ojph_block_common.h b/src/core/coding/ojph_block_common.h index 29a84bad..f8d65032 100644 --- a/src/core/coding/ojph_block_common.h +++ b/src/core/coding/ojph_block_common.h @@ -44,6 +44,6 @@ namespace ojph{ extern ui16 vlc_tbl1[1024]; extern ui16 uvlc_tbl0[256+64]; extern ui16 uvlc_tbl1[256]; - + extern ui8 uvlc_bias[256+64]; } // !namespace local } // !namespace ojph diff --git 
a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder.cpp index cfa86974..479926b0 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder.cpp @@ -2025,7 +2025,9 @@ namespace ojph { // run = mel_get_run(&mel); // get another run //decode uvlc_mode to get u for both quads - ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)]; + ui32 idx = uvlc_mode + (ui32)(vlc_val & 0x3F); + ui32 uvlc_entry = uvlc_tbl0[idx]; + ui16 u_bias = uvlc_bias[idx]; //remove total prefix length vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7); uvlc_entry >>= 3; @@ -2037,21 +2039,21 @@ namespace ojph { // quad 0 length len = uvlc_entry & 0x7; // quad 0 suffix length uvlc_entry >>= 3; - ui16 u_q0 = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<> 3) + (tmp >> len)); //kappa==1 + ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); + ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // decode u_q extensions, which is needed only when u_q > 32 ui16 u_ext; bool cond0, cond1; - cond0 = u_q0 > 32; - u_ext = (ui16)(cond0 ? (uvlc_entry & 0xF) : 0); + cond0 = u_q0 - (u_bias & 0x3) > 32; + u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0); u_q0 = (ui16)(u_q0 + (u_ext << 2)); - sp[1] = u_q0; - cond1 = u_q1 > 32; - u_ext = (ui16)(cond1 ? (uvlc_entry & 0xF) : 0); + sp[1] = u_q0 + 1; // kappa = 1 + cond1 = u_q1 - (u_bias >> 2) > 32; + u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0); u_q1 = (ui16)(u_q1 + (u_ext << 2)); - sp[3] = u_q1; + sp[3] = u_q1 + 1; // kappa = 1 } sp[0] = sp[1] = 0; @@ -2165,12 +2167,12 @@ namespace ojph { // decode u_q extensions, which is needed only when u_q > 32 ui16 u_ext; bool cond0, cond1; cond0 = u_q0 > 32; - u_ext = (ui16)(cond0 ? (uvlc_entry & 0xF) : 0); + u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond0 ? 
4 : 0); u_q0 = (ui16)(u_q0 + (u_ext << 2)); sp[1] = u_q0; cond1 = u_q1 > 32; - u_ext = (ui16)(cond1 ? (uvlc_entry & 0xF) : 0); + u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0); u_q1 = (ui16)(u_q1 + (u_ext << 2)); sp[3] = u_q1; From 6fb5add23c4c230135b9b3bac775978cd6f2c1de Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 3 Nov 2024 12:46:41 +1100 Subject: [PATCH 48/78] Addresses one warning. --- src/core/coding/ojph_block_decoder.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder.cpp index 479926b0..51aaae0b 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder.cpp @@ -2048,12 +2048,12 @@ namespace ojph { u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0); u_q0 = (ui16)(u_q0 + (u_ext << 2)); - sp[1] = u_q0 + 1; // kappa = 1 + sp[1] = u_q0 + (ui16)1; // kappa = 1 cond1 = u_q1 - (u_bias >> 2) > 32; u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0); u_q1 = (ui16)(u_q1 + (u_ext << 2)); - sp[3] = u_q1 + 1; // kappa = 1 + sp[3] = u_q1 + (ui16)1; // kappa = 1 } sp[0] = sp[1] = 0; From 8558809dc1fc15a777753dc54cd4a426f9247c78 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 3 Nov 2024 12:48:05 +1100 Subject: [PATCH 49/78] Another attempt to address warning. --- src/core/coding/ojph_block_decoder.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder.cpp index 51aaae0b..3ee6ca26 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder.cpp @@ -2048,12 +2048,12 @@ namespace ojph { u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond0 ? 
4 : 0); u_q0 = (ui16)(u_q0 + (u_ext << 2)); - sp[1] = u_q0 + (ui16)1; // kappa = 1 + sp[1] = (ui16)(u_q0 + 1); // kappa = 1 cond1 = u_q1 - (u_bias >> 2) > 32; u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0); vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0); u_q1 = (ui16)(u_q1 + (u_ext << 2)); - sp[3] = u_q1 + (ui16)1; // kappa = 1 + sp[3] = (ui16)(u_q1 + 1); // kappa = 1 } sp[0] = sp[1] = 0; From eafb96511b4d8f5d448a5f490cf6446594cfb4f4 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 3 Nov 2024 13:13:21 +1100 Subject: [PATCH 50/78] Address another warning --- src/core/codestream/ojph_tile.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index 11d7b406..4755bb44 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -258,7 +258,7 @@ namespace ojph { line_buf *tc = comps[comp_num].get_line(); if (reversible) { - si64 shift = 1LL << (num_bits[comp_num] - 1); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); if (is_signed[comp_num] && nlt_type3[comp_num]) rev_convert_nlt_type3(line, line_offsets[comp_num], tc, 0, shift + 1, comp_width); @@ -282,7 +282,7 @@ namespace ojph { } else { - si64 shift = 1LL << (num_bits[comp_num] - 1); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); ui32 comp_width = comp_rects[comp_num].siz.w; if (reversible) { @@ -346,7 +346,7 @@ namespace ojph { ui32 comp_width = recon_comp_rects[comp_num].siz.w; if (reversible) { - si64 shift = 1LL << (num_bits[comp_num] - 1); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); if (is_signed[comp_num] && nlt_type3[comp_num]) rev_convert_nlt_type3(src_line, 0, tgt_line, line_offsets[comp_num], shift + 1, comp_width); @@ -384,7 +384,7 @@ namespace ojph { } if (reversible) { - si64 shift = 1LL << (num_bits[comp_num] - 1); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); line_buf* src_line; if (comp_num < 3) src_line = lines + comp_num; From 
5f99c89698229895f2c91cc49ef28376fd49a767 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 5 Nov 2024 18:24:27 +1100 Subject: [PATCH 51/78] Most of the SIMD is done. No wasm yet. --- src/core/codestream/ojph_codeblock.cpp | 6 +- src/core/codestream/ojph_codeblock_fun.cpp | 41 +- src/core/codestream/ojph_codeblock_fun.h | 6 +- src/core/codestream/ojph_codestream_avx.cpp | 2 +- src/core/codestream/ojph_codestream_avx2.cpp | 79 ++- src/core/codestream/ojph_codestream_gen.cpp | 12 +- src/core/codestream/ojph_codestream_sse.cpp | 3 +- src/core/codestream/ojph_codestream_sse2.cpp | 85 ++- src/core/codestream/ojph_resolution.cpp | 36 +- src/core/coding/ojph_block_encoder_avx2.cpp | 7 +- src/core/coding/ojph_block_encoder_avx512.cpp | 10 +- src/core/transform/ojph_colour.cpp | 10 +- src/core/transform/ojph_colour_local.h | 43 +- src/core/transform/ojph_colour_sse2.cpp | 354 ++++++++-- src/core/transform/ojph_transform.cpp | 38 +- src/core/transform/ojph_transform_avx.cpp | 74 +- src/core/transform/ojph_transform_avx2.cpp | 656 +++++++++++++++++- src/core/transform/ojph_transform_local.h | 76 +- src/core/transform/ojph_transform_sse.cpp | 4 +- src/core/transform/ojph_transform_sse2.cpp | 572 ++++++++++++++- 20 files changed, 1834 insertions(+), 280 deletions(-) diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp index 53d9a6b1..bd76fb3f 100644 --- a/src/core/codestream/ojph_codeblock.cpp +++ b/src/core/codestream/ojph_codeblock.cpp @@ -245,7 +245,7 @@ namespace ojph { cb_size.w); } else - this->codeblock_functions.mem_clear32(dp, cb_size.w * sizeof(ui32)); + this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(ui32)); } else { @@ -259,9 +259,7 @@ namespace ojph { cb_size.w); } else - this->codeblock_functions.mem_clear64(dp, cb_size.w * sizeof(*dp)); - - + this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp)); } ++cur_line; diff --git a/src/core/codestream/ojph_codeblock_fun.cpp 
b/src/core/codestream/ojph_codeblock_fun.cpp index 4474428f..c0b70dc9 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -57,15 +57,10 @@ namespace ojph { { ////////////////////////////////////////////////////////////////////////// - void gen_mem_clear32(si32* addr, size_t count); - void sse_mem_clear32(si32* addr, size_t count); - void avx_mem_clear32(si32* addr, size_t count); - void wasm_mem_clear32(si32* addr, size_t count); - - void gen_mem_clear64(si64* addr, size_t count); - void sse_mem_clear64(si64* addr, size_t count); - void avx_mem_clear64(si64* addr, size_t count); - void wasm_mem_clear64(si64* addr, size_t count); + void gen_mem_clear(void* addr, size_t count); + void sse_mem_clear(void* addr, size_t count); + void avx_mem_clear(void* addr, size_t count); + void wasm_mem_clear(void* addr, size_t count); ////////////////////////////////////////////////////////////////////////// ui32 gen_find_max_val32(ui32* address); @@ -135,7 +130,7 @@ namespace ojph { // Default path, no acceleration. 
We may change this later decode_cb32 = ojph_decode_codeblock32; find_max_val32 = gen_find_max_val32; - mem_clear32 = gen_mem_clear32; + mem_clear = gen_mem_clear; if (reversible) { tx_to_cb32 = gen_rev_tx_to_cb32; tx_from_cb32 = gen_rev_tx_from_cb32; @@ -149,7 +144,6 @@ namespace ojph { decode_cb64 = ojph_decode_codeblock64; find_max_val64 = gen_find_max_val64; - mem_clear64 = gen_mem_clear64; if (reversible) { tx_to_cb64 = gen_rev_tx_to_cb64; tx_from_cb64 = gen_rev_tx_from_cb64; @@ -168,7 +162,7 @@ namespace ojph { // Accelerated functions for INTEL/AMD CPUs #ifndef OJPH_DISABLE_SSE if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE) - mem_clear32 = sse_mem_clear32; + mem_clear = sse_mem_clear; #endif // !OJPH_DISABLE_SSE #ifndef OJPH_DISABLE_SSE2 @@ -182,6 +176,16 @@ namespace ojph { tx_to_cb32 = sse2_irv_tx_to_cb32; tx_from_cb32 = sse2_irv_tx_from_cb32; } + find_max_val64 = sse2_find_max_val64; + if (reversible) { + tx_to_cb64 = sse2_rev_tx_to_cb64; + tx_from_cb64 = sse2_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; + } } #endif // !OJPH_DISABLE_SSE2 @@ -192,7 +196,7 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX) - mem_clear32 = avx_mem_clear32; + mem_clear = avx_mem_clear; #endif // !OJPH_DISABLE_AVX #ifndef OJPH_DISABLE_AVX2 @@ -208,6 +212,17 @@ namespace ojph { } encode_cb32 = ojph_encode_codeblock_avx2; decode_cb32 = ojph_decode_codeblock_avx2; + + find_max_val64 = avx2_find_max_val64; + if (reversible) { + tx_to_cb64 = avx2_rev_tx_to_cb64; + tx_from_cb64 = avx2_rev_tx_from_cb64; + } + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; + } } #endif // !OJPH_DISABLE_AVX2 diff --git a/src/core/codestream/ojph_codeblock_fun.h b/src/core/codestream/ojph_codeblock_fun.h index 03b3b243..67fbc2b7 100644 --- a/src/core/codestream/ojph_codeblock_fun.h +++ b/src/core/codestream/ojph_codeblock_fun.h @@ -48,8 +48,7 @@ namespace ojph { namespace local { // define function signature simple 
memory clearing - typedef void (*mem_clear_fun32)(si32* addr, size_t count); - typedef void (*mem_clear_fun64)(si64* addr, size_t count); + typedef void (*mem_clear_fun)(void* addr, size_t count); // define function signature for max value finding typedef ui32 (*find_max_val_fun32)(ui32* addr); @@ -96,8 +95,7 @@ namespace ojph { void init(bool reversible); // a pointer to the max value finding function - mem_clear_fun32 mem_clear32; - mem_clear_fun64 mem_clear64; + mem_clear_fun mem_clear; // a pointer to the max value finding function find_max_val_fun32 find_max_val32; diff --git a/src/core/codestream/ojph_codestream_avx.cpp b/src/core/codestream/ojph_codestream_avx.cpp index 22405c7e..4c6d678d 100644 --- a/src/core/codestream/ojph_codestream_avx.cpp +++ b/src/core/codestream/ojph_codestream_avx.cpp @@ -42,7 +42,7 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void avx_mem_clear32(si32* addr, size_t count) + void avx_mem_clear(void* addr, size_t count) { float* p = (float*)addr; __m256 zero = _mm256_setzero_ps(); diff --git a/src/core/codestream/ojph_codestream_avx2.cpp b/src/core/codestream/ojph_codestream_avx2.cpp index bd849b59..c01e0718 100644 --- a/src/core/codestream/ojph_codestream_avx2.cpp +++ b/src/core/codestream/ojph_codestream_avx2.cpp @@ -55,6 +55,18 @@ namespace ojph { return t; } + ////////////////////////////////////////////////////////////////////////// + ui64 avx2_find_max_val64(ui64* address) + { + __m128i x0 = _mm_loadu_si128((__m128i*)address); + __m128i x1 = _mm_loadu_si128((__m128i*)address + 1); + x0 = _mm_or_si128(x0, x1); + x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3] + x0 = _mm_or_si128(x0, x1); + ui64 t = (ui64)_mm_extract_epi64(x0, 0); + return t; + } + ////////////////////////////////////////////////////////////////////////// void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32* max_val) @@ -78,7 +90,7 @@ namespace 
ojph { } _mm256_storeu_si256((__m256i*)max_val, tmax); } - + ////////////////////////////////////////////////////////////////////////// void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32* max_val) @@ -115,11 +127,11 @@ namespace ojph { si32 *p = (si32*)dp; for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8) { - __m256i v = _mm256_load_si256((__m256i*)sp); - __m256i val = _mm256_and_si256(v, m1); - val = _mm256_srli_epi32(val, (int)shift); - val = _mm256_sign_epi32(val, v); - _mm256_storeu_si256((__m256i*)p, val); + __m256i v = _mm256_load_si256((__m256i*)sp); + __m256i val = _mm256_and_si256(v, m1); + val = _mm256_srli_epi32(val, (int)shift); + val = _mm256_sign_epi32(val, v); + _mm256_storeu_si256((__m256i*)p, val); } } @@ -142,5 +154,58 @@ namespace ojph { _mm256_storeu_ps(p, valf); } } + + ////////////////////////////////////////////////////////////////////////// + void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 63 - K_max; + __m256i m0 = _mm256_set1_epi64x(0x8000000000000000LL); + __m256i zero = _mm256_setzero_si256(); + __m256i one = _mm256_set1_epi64x(1); + __m256i tmax = _mm256_loadu_si256((__m256i*)max_val); + __m256i *p = (__m256i*)sp; + for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4) + { + __m256i v = _mm256_loadu_si256(p); + __m256i sign = _mm256_cmpgt_epi64(zero, v); + __m256i val = _mm256_xor_si256(v, sign); // negate 1's complement + __m256i ones = _mm256_and_si256(sign, one); + val = _mm256_add_epi64(val, ones); // 2's complement + sign = _mm256_and_si256(sign, m0); + val = _mm256_slli_epi64(val, (int)shift); + tmax = _mm256_or_si256(tmax, val); + val = _mm256_or_si256(val, sign); + _mm256_storeu_si256((__m256i*)dp, val); + } + _mm256_storeu_si256((__m256i*)max_val, tmax); + } + + 
////////////////////////////////////////////////////////////////////////// + void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + + ui32 shift = 63 - K_max; + __m256i m1 = _mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL); + __m256i zero = _mm256_setzero_si256(); + __m256i one = _mm256_set1_epi64x(1); + si64 *p = (si64*)dp; + for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) + { + __m256i v = _mm256_load_si256((__m256i*)sp); + __m256i val = _mm256_and_si256(v, m1); + val = _mm256_srli_epi64(val, (int)shift); + __m256i sign = _mm256_cmpgt_epi64(zero, v); + val = _mm256_xor_si256(val, sign); // negate 1's complement + __m256i ones = _mm256_and_si256(sign, one); + val = _mm256_add_epi64(val, ones); // 2's complement + _mm256_storeu_si256((__m256i*)p, val); + } + } } -} \ No newline at end of file +} diff --git a/src/core/codestream/ojph_codestream_gen.cpp b/src/core/codestream/ojph_codestream_gen.cpp index 50fc878d..cdc72c6e 100644 --- a/src/core/codestream/ojph_codestream_gen.cpp +++ b/src/core/codestream/ojph_codestream_gen.cpp @@ -42,17 +42,11 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void gen_mem_clear32(si32* addr, size_t count) - { - for (size_t i = 0; i < count; i += 4) - *addr++ = 0; - } - - ////////////////////////////////////////////////////////////////////////// - void gen_mem_clear64(si64* addr, size_t count) + void gen_mem_clear(void* addr, size_t count) { + si64* p = (si64*)addr; for (size_t i = 0; i < count; i += 8) - *addr++ = 0; + *p++ = 0; } ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/codestream/ojph_codestream_sse.cpp b/src/core/codestream/ojph_codestream_sse.cpp index 99082aaa..6a31cbd6 100644 --- a/src/core/codestream/ojph_codestream_sse.cpp +++ b/src/core/codestream/ojph_codestream_sse.cpp @@ -42,13 +42,12 @@ namespace ojph { namespace local { 
////////////////////////////////////////////////////////////////////////// - void sse_mem_clear32(si32* addr, size_t count) + void sse_mem_clear(void* addr, size_t count) { float* p = (float*)addr; __m128 zero = _mm_setzero_ps(); for (size_t i = 0; i < count; i += 16, p += 4) _mm_storeu_ps(p, zero); } - } } \ No newline at end of file diff --git a/src/core/codestream/ojph_codestream_sse2.cpp b/src/core/codestream/ojph_codestream_sse2.cpp index 145db822..738f24b0 100644 --- a/src/core/codestream/ojph_codestream_sse2.cpp +++ b/src/core/codestream/ojph_codestream_sse2.cpp @@ -58,6 +58,21 @@ namespace ojph { // return t; } + ////////////////////////////////////////////////////////////////////////// + ui64 sse2_find_max_val64(ui64* address) + { + __m128i x1, x0 = _mm_loadu_si128((__m128i*)address); + x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3] + x0 = _mm_or_si128(x0, x1); + _mm_storeu_si128((__m128i*)address, x0); + return *address; + // A single movd t, xmm0 can do the trick, but it is not available + // in SSE2 intrinsics. 
extract_epi32 is available in sse4.1 + // ui32 t = (ui32)_mm_extract_epi16(x0, 0); + // t |= (ui32)_mm_extract_epi16(x0, 1) << 16; + // return t; + } + ////////////////////////////////////////////////////////////////////////// void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32* max_val) @@ -129,14 +144,14 @@ namespace ojph { si32 *p = (si32*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) { - __m128i v = _mm_load_si128((__m128i*)sp); - __m128i val = _mm_and_si128(v, m1); - val = _mm_srli_epi32(val, (int)shift); - __m128i sign = _mm_cmplt_epi32(v, zero); - val = _mm_xor_si128(val, sign); // negate 1's complement - __m128i ones = _mm_and_si128(sign, one); - val = _mm_add_epi32(val, ones); // 2's complement - _mm_storeu_si128((__m128i*)p, val); + __m128i v = _mm_load_si128((__m128i*)sp); + __m128i val = _mm_and_si128(v, m1); + val = _mm_srli_epi32(val, (int)shift); + __m128i sign = _mm_cmplt_epi32(v, zero); + val = _mm_xor_si128(val, sign); // negate 1's complement + __m128i ones = _mm_and_si128(sign, one); + val = _mm_add_epi32(val, ones); // 2's complement + _mm_storeu_si128((__m128i*)p, val); } } @@ -159,5 +174,59 @@ namespace ojph { _mm_storeu_ps(p, valf); } } + + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 63 - K_max; + __m128i m0 = _mm_set1_epi64x(0x8000000000000000LL); + __m128i zero = _mm_setzero_si128(); + __m128i one = _mm_set1_epi64x(1); + __m128i tmax = _mm_loadu_si128((__m128i*)max_val); + __m128i *p = (__m128i*)sp; + for (ui32 i = 0; i < count; i += 2, p += 1, dp += 2) + { + __m128i v = _mm_loadu_si128(p); + __m128i sign = _mm_cmplt_epi32(v, zero); + sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3]; + __m128i val = _mm_xor_si128(v, sign); // negate 
1's complement + __m128i ones = _mm_and_si128(sign, one); + val = _mm_add_epi64(val, ones); // 2's complement + sign = _mm_and_si128(sign, m0); + val = _mm_slli_epi64(val, (int)shift); + tmax = _mm_or_si128(tmax, val); + val = _mm_or_si128(val, sign); + _mm_storeu_si128((__m128i*)dp, val); + } + _mm_storeu_si128((__m128i*)max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 63 - K_max; + __m128i m1 = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL); + __m128i zero = _mm_setzero_si128(); + __m128i one = _mm_set1_epi64x(1); + si64 *p = (si64*)dp; + for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2) + { + __m128i v = _mm_load_si128((__m128i*)sp); + __m128i val = _mm_and_si128(v, m1); + val = _mm_srli_epi64(val, (int)shift); + __m128i sign = _mm_cmplt_epi32(v, zero); + sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3]; + val = _mm_xor_si128(val, sign); // negate 1's complement + __m128i ones = _mm_and_si128(sign, one); + val = _mm_add_epi64(val, ones); // 2's complement + _mm_storeu_si128((__m128i*)p, val); + } + } } } \ No newline at end of file diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp index fb4efdfe..bcb27c98 100644 --- a/src/core/codestream/ojph_resolution.cpp +++ b/src/core/codestream/ojph_resolution.cpp @@ -708,8 +708,8 @@ namespace ojph { rev_horz_syn(atk, aug->line, child_res->pull_line(), bands[1].pull_line(), width, horz_even); else - memcpy(aug->line->i32, child_res->pull_line()->i32, - width * sizeof(si32)); + memcpy(aug->line->p, child_res->pull_line()->p, + width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); aug->active = true; vert_even = !vert_even; ++cur_line; @@ -720,8 +720,8 @@ namespace ojph { rev_horz_syn(atk, sig->line, bands[2].pull_line(), bands[3].pull_line(), width, horz_even); else - memcpy(sig->line->i32, 
bands[2].pull_line()->i32, - width * sizeof(si32)); + memcpy(sig->line->p, bands[2].pull_line()->p, + width * (sig->line->flags & line_buf::LFT_SIZE_MASK)); sig->active = true; vert_even = !vert_even; ++cur_line; @@ -759,8 +759,8 @@ namespace ojph { rev_horz_syn(atk, aug->line, child_res->pull_line(), bands[1].pull_line(), width, horz_even); else - memcpy(aug->line->i32, child_res->pull_line()->i32, - width * sizeof(si32)); + memcpy(aug->line->p, child_res->pull_line()->p, + width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); } else { @@ -768,11 +768,21 @@ namespace ojph { rev_horz_syn(atk, aug->line, bands[2].pull_line(), bands[3].pull_line(), width, horz_even); else - memcpy(aug->line->i32, bands[2].pull_line()->i32, - width * sizeof(si32)); - si32* sp = aug->line->i32; - for (ui32 i = width; i > 0; --i) - *sp++ >>= 1; + memcpy(aug->line->p, bands[2].pull_line()->p, + width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); + if (aug->line->flags & line_buf::LFT_32BIT) + { + si32* sp = aug->line->i32; + for (ui32 i = width; i > 0; --i) + *sp++ >>= 1; + } + else + { + assert(aug->line->flags & line_buf::LFT_64BIT); + si64* sp = aug->line->i64; + for (ui32 i = width; i > 0; --i) + *sp++ >>= 1; + } } return aug->line; } @@ -880,8 +890,8 @@ namespace ojph { rev_horz_syn(atk, aug->line, child_res->pull_line(), bands[1].pull_line(), width, horz_even); else - memcpy(aug->line->i32, child_res->pull_line()->i32, - width * sizeof(si32)); + memcpy(aug->line->p, child_res->pull_line()->p, + width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); return aug->line; } else diff --git a/src/core/coding/ojph_block_encoder_avx2.cpp b/src/core/coding/ojph_block_encoder_avx2.cpp index d579f83a..6f3db34e 100644 --- a/src/core/coding/ojph_block_encoder_avx2.cpp +++ b/src/core/coding/ojph_block_encoder_avx2.cpp @@ -64,8 +64,8 @@ namespace ojph { // index is (c_q << 8) + (rho << 4) + eps // data is (cwd << 8) + (cwd_len << 4) + eps // table 0 is for the initial line of quads - static 
ui32 vlc_tbl0[2048] = { 0 }; - static ui32 vlc_tbl1[2048] = { 0 }; + static ui32 vlc_tbl0[2048]; + static ui32 vlc_tbl1[2048]; //UVLC encoding static ui32 ulvc_cwd_pre[33]; @@ -220,6 +220,9 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// bool initialize_tables_avx2() { if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { + memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); + memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); + bool result; result = vlc_init_tables(); result = result && uvlc_init_tables(); diff --git a/src/core/coding/ojph_block_encoder_avx512.cpp b/src/core/coding/ojph_block_encoder_avx512.cpp index 9df0e8ef..f0c7438b 100644 --- a/src/core/coding/ojph_block_encoder_avx512.cpp +++ b/src/core/coding/ojph_block_encoder_avx512.cpp @@ -64,8 +64,8 @@ namespace ojph { // index is (c_q << 8) + (rho << 4) + eps // data is (cwd << 8) + (cwd_len << 4) + eps // table 0 is for the initial line of quads - static ui32 vlc_tbl0[2048] = { 0 }; - static ui32 vlc_tbl1[2048] = { 0 }; + static ui32 vlc_tbl0[2048]; + static ui32 vlc_tbl1[2048]; //UVLC encoding static ui32 ulvc_cwd_pre[33]; @@ -219,7 +219,11 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// bool initialize_tables() { - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { + if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) + { + memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); + memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); + bool result; result = vlc_init_tables(); result = result && uvlc_init_tables(); diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index a72cd3d4..6289ae13 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -109,8 +109,6 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) - // cnvrt_si32_to_si32_shftd = gen_cnvrt_si32_to_si32_shftd; - // cnvrt_si32_to_si32_nlt_type3 = gen_cnvrt_si32_to_si32_nlt_type3; rev_convert 
= gen_rev_convert; rev_convert_nlt_type3 = gen_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = gen_cnvrt_si32_to_float_shftd; @@ -141,12 +139,12 @@ namespace ojph { #ifndef OJPH_DISABLE_SSE2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) { + rev_convert = sse2_rev_convert; + rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3; cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd; cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32; - // cnvrt_si32_to_si32_shftd = sse2_cnvrt_si32_to_si32_shftd; - // cnvrt_si32_to_si32_nlt_type3 = sse2_cnvrt_si32_to_si32_nlt_type3; - // rct_forward = sse2_rct_forward; - // rct_backward = sse2_rct_backward; + rct_forward = sse2_rct_forward; + rct_backward = sse2_rct_backward; } #endif // !OJPH_DISABLE_SSE2 diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index 08e99a92..5314c53b 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -167,21 +167,26 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void sse2_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); - + void sse2_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void sse2_rct_forward( + const line_buf *r, const line_buf *g, 
const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void sse2_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// // @@ -232,12 +237,14 @@ namespace ojph { int shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void avx2_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void avx2_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// // @@ -272,12 +279,14 @@ namespace ojph { int shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void wasm_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void wasm_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, 
ui32 repeat); ////////////////////////////////////////////////////////////////////////// void wasm_ict_forward(const float *r, const float *g, const float *b, diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index c50c091e..3829f6a5 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -39,6 +39,7 @@ #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include @@ -46,6 +47,118 @@ namespace ojph { namespace local { + ///////////////////////////////////////////////////////////////////////// + // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h + static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) + { + // note than m must be obtained using + // __m128i ve = _mm_set1_epi64x(1ULL << (63 - amt)); + __m128i x = _mm_srli_epi64(a, amt); + x = _mm_xor_si128(x, m); + __m128i result = _mm_sub_epi64(x, m); + return result; + } + + ////////////////////////////////////////////////////////////////////////// + static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero) + { + __m128i s, t; + s = _mm_unpacklo_epi32(a, zero); // missing extended -ve + t = _mm_cmplt_epi32(a, zero); // get -ve + t = _mm_unpacklo_epi32(zero, t); + s = _mm_or_si128(t, s); // put -ve + return s; + } + + ////////////////////////////////////////////////////////////////////////// + static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero) + { + __m128i s, t; + s = _mm_unpackhi_epi32(a, zero); // missing extended -ve + t = _mm_cmplt_epi32(a, zero); // get -ve + t = _mm_unpackhi_epi32(zero, t); + s = _mm_or_si128(t, s); // put -ve + return s; + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & 
line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + si32 s = (si32)shift; + for (ui32 i = width; i > 0; --i) + *dp++ = *sp++ + s; + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + for (ui32 i = width; i > 0; --i) + *dp++ = *sp++ + shift; + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + for (ui32 i = width; i > 0; --i) + *dp++ = (si32)(*sp++ + shift); + } + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + si32 s = (si32)shift; + for (ui32 i = width; i > 0; --i) { + const si32 v = *sp++; + *dp++ = v >= 0 ? v : (- v - s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + for (ui32 i = width; i > 0; --i) { + const si64 v = *sp++; + *dp++ = v >= 0 ? v : (- v - shift); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + for (ui32 i = width; i > 0; --i) { + const si64 v = *sp++; + *dp++ = (si32)(v >= 0 ? 
v : (- v - shift)); + } + } + } + ////////////////////////////////////////////////////////////////////////// void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width) @@ -80,80 +193,199 @@ namespace ojph { _MM_SET_ROUNDING_MODE(rounding_mode); } - ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void sse2_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) { - __m128i sh = _mm_set1_epi32(shift); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m128i s = _mm_loadu_si128((__m128i*)sp); - s = _mm_add_epi32(s, sh); - _mm_storeu_si128((__m128i*)dp, s); - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i mr = _mm_load_si128((__m128i*)rp); + __m128i mg = _mm_load_si128((__m128i*)gp); + __m128i mb = _mm_load_si128((__m128i*)bp); + __m128i t = _mm_add_epi32(mr, mb); + t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); + _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2)); + t = _mm_sub_epi32(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi32(mr, mg); + _mm_store_si128((__m128i*)crp, t); - ////////////////////////////////////////////////////////////////////////// - void 
sse2_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, - int shift, ui32 width) - { - __m128i sh = _mm_set1_epi32(-shift); - __m128i zero = _mm_setzero_si128(); - for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else { - __m128i s = _mm_loadu_si128((__m128i*)sp); - __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value - __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value - v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value - s = _mm_andnot_si128(c, s); // keep only +ve or 0 - s = _mm_or_si128(s, v_m_sh); // combine - _mm_storeu_si128((__m128i*)dp, s); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m128i zero = _mm_setzero_si128(); + __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i mr32 = _mm_load_si128((__m128i*)rp); + __m128i mg32 = _mm_load_si128((__m128i*)gp); + __m128i mb32 = _mm_load_si128((__m128i*)bp); + __m128i mr, mg, mb, t; + mr = sse2_cvtlo_epi32_epi64(mr32, zero); + mg = sse2_cvtlo_epi32_epi64(mg32, zero); + mb = sse2_cvtlo_epi32_epi64(mb32, zero); + + t = _mm_add_epi64(mr, mb); + t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); + _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); + t = _mm_sub_epi64(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi64(mr, mg); + _mm_store_si128((__m128i*)crp, t); + + yp += 2; cbp += 2; crp += 2; + + mr = sse2_cvthi_epi32_epi64(mr32, zero); + mg = sse2_cvthi_epi32_epi64(mg32, zero); + mb = sse2_cvthi_epi32_epi64(mb32, zero); + + t = _mm_add_epi64(mr, mb); + t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); + _mm_store_si128((__m128i*)yp, 
sse2_mm_srai_epi64(t, 2, v2)); + t = _mm_sub_epi64(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi64(mr, mg); + _mm_store_si128((__m128i*)crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } } } ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void sse2_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 3) >> 2; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m128i mr = _mm_load_si128((__m128i*)r); - __m128i mg = _mm_load_si128((__m128i*)g); - __m128i mb = _mm_load_si128((__m128i*)b); - __m128i t = _mm_add_epi32(mr, mb); - t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); - _mm_store_si128((__m128i*)y, _mm_srai_epi32(t, 2)); - t = _mm_sub_epi32(mb, mg); - _mm_store_si128((__m128i*)cb, t); - t = _mm_sub_epi32(mr, mg); - _mm_store_si128((__m128i*)cr, t); - - r += 4; g += 4; b += 4; - y += 4; cb += 4; cr += 4; - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i my = _mm_load_si128((__m128i*)yp); + __m128i mcb = _mm_load_si128((__m128i*)cbp); + __m128i mcr = _mm_load_si128((__m128i*)crp); - ////////////////////////////////////////////////////////////////////////// - 
void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) - { - for (int i = (repeat + 3) >> 2; i > 0; --i) + __m128i t = _mm_add_epi32(mcb, mcr); + t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); + _mm_store_si128((__m128i*)gp, t); + __m128i u = _mm_add_epi32(mcb, t); + _mm_store_si128((__m128i*)bp, u); + u = _mm_add_epi32(mcr, t); + _mm_store_si128((__m128i*)rp, u); + + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } + } + else { - __m128i my = _mm_load_si128((__m128i*)y); - __m128i mcb = _mm_load_si128((__m128i*)cb); - __m128i mcr = _mm_load_si128((__m128i*)cr); - - __m128i t = _mm_add_epi32(mcb, mcr); - t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); - _mm_store_si128((__m128i*)g, t); - __m128i u = _mm_add_epi32(mcb, t); - _mm_store_si128((__m128i*)b, u); - u = _mm_add_epi32(mcr, t); - _mm_store_si128((__m128i*)r, u); - - y += 4; cb += 4; cr += 4; - r += 4; g += 4; b += 4; + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); + __m128i low_bits = _mm_set_epi64x(0, 0xFFFFFFFFFFFFFFFFLL); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i my, mcb, mcr, tr, tg, tb; + my = _mm_load_si128((__m128i*)yp); + mcb = _mm_load_si128((__m128i*)cbp); + mcr = _mm_load_si128((__m128i*)crp); + + tg = _mm_add_epi64(mcb, mcr); + tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2)); + tb = _mm_add_epi64(mcb, tg); + tr = _mm_add_epi64(mcr, tg); + + __m128i mr, mg, mb; + mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0)); + mr = _mm_and_si128(low_bits, mr); + mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0)); + mg = _mm_and_si128(low_bits, mg); + mb = _mm_shuffle_epi32(tb, 
_MM_SHUFFLE(0, 0, 2, 0)); + mb = _mm_and_si128(low_bits, mb); + + yp += 2; cbp += 2; crp += 2; + + my = _mm_load_si128((__m128i*)yp); + mcb = _mm_load_si128((__m128i*)cbp); + mcr = _mm_load_si128((__m128i*)crp); + + tg = _mm_add_epi64(mcb, mcr); + tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2)); + tb = _mm_add_epi64(mcb, tg); + tr = _mm_add_epi64(mcr, tg); + + tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); + tr = _mm_andnot_si128(low_bits, tr); + mr = _mm_or_si128(mr, tr); + tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); + tg = _mm_andnot_si128(low_bits, tg); + mg = _mm_or_si128(mg, tg); + tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); + tb = _mm_andnot_si128(low_bits, tb); + mb = _mm_or_si128(mb, tb); + + _mm_store_si128((__m128i*)rp, mr); + _mm_store_si128((__m128i*)gp, mg); + _mm_store_si128((__m128i*)bp, mb); + + yp += 2; cbp += 2; crp += 2; + rp += 4; gp += 4; bp += 4; + } } } diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp index 32189e56..c4313ab2 100644 --- a/src/core/transform/ojph_transform.cpp +++ b/src/core/transform/ojph_transform.cpp @@ -127,14 +127,14 @@ namespace ojph { } #endif // !OJPH_DISABLE_SSE - // #ifndef OJPH_DISABLE_SSE2 - // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) - // { - // rev_vert_step = sse2_rev_vert_step; - // rev_horz_ana = sse2_rev_horz_ana; - // rev_horz_syn = sse2_rev_horz_syn; - // } - // #endif // !OJPH_DISABLE_SSE2 + #ifndef OJPH_DISABLE_SSE2 + if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) + { + rev_vert_step = sse2_rev_vert_step; + rev_horz_ana = sse2_rev_horz_ana; + rev_horz_syn = sse2_rev_horz_syn; + } + #endif // !OJPH_DISABLE_SSE2 #ifndef OJPH_DISABLE_AVX if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX) @@ -146,14 +146,14 @@ namespace ojph { } #endif // !OJPH_DISABLE_AVX - // #ifndef OJPH_DISABLE_AVX2 - // if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) - // { - // rev_vert_step = avx2_rev_vert_step; - // rev_horz_ana = 
avx2_rev_horz_ana; - // rev_horz_syn = avx2_rev_horz_syn; - // } - // #endif // !OJPH_DISABLE_AVX2 + #ifndef OJPH_DISABLE_AVX2 + if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) + { + rev_vert_step = avx2_rev_vert_step; + rev_horz_ana = avx2_rev_horz_ana; + rev_horz_syn = avx2_rev_horz_syn; + } + #endif // !OJPH_DISABLE_AVX2 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) @@ -194,6 +194,7 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) ///////////////////////////////////////////////////////////////////////// + static void gen_rev_vert_step32(const lifting_step* s, const line_buf* sig, const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis) @@ -245,6 +246,7 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// + static void gen_rev_vert_step64(const lifting_step* s, const line_buf* sig, const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis) @@ -319,6 +321,7 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// + static void gen_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, const line_buf* hdst, const line_buf* src, ui32 width, bool even) @@ -397,6 +400,7 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// + static void gen_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, const line_buf* hdst, const line_buf* src, ui32 width, bool even) @@ -495,6 +499,7 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// + static void gen_rev_horz_syn32(const param_atk* atk, const line_buf* dst, const line_buf* lsrc, const line_buf* hsrc, ui32 width, bool even) @@ -573,6 +578,7 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// + static void gen_rev_horz_syn64(const param_atk* atk, const line_buf* dst, 
const line_buf* lsrc, const line_buf* hsrc, ui32 width, bool even) diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp index 08566624..4e5b82e7 100644 --- a/src/core/transform/ojph_transform_avx.cpp +++ b/src/core/transform/ojph_transform_avx.cpp @@ -61,6 +61,76 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline + void avx_deinterleave32(float* dpl, float* dph, float* sp, + int width, bool even) + { + if (even) + { + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) + { + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); + } + } + else + { + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) + { + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, f); + _mm256_store_ps(dph, e); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void avx_interleave32(float* dp, float* spl, float* sph, + int width, bool even) + { + if (even) + { + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) + { + __m256 a = _mm256_load_ps(spl); + __m256 b = _mm256_load_ps(sph); + __m256 c = _mm256_unpacklo_ps(a, b); + __m256 d = _mm256_unpackhi_ps(a, b); + __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); + __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); + _mm256_store_ps(dp, e); + 
_mm256_store_ps(dp + 8, f); + } + } + else + { + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) + { + __m256 a = _mm256_load_ps(spl); + __m256 b = _mm256_load_ps(sph); + __m256 c = _mm256_unpacklo_ps(b, a); + __m256 d = _mm256_unpackhi_ps(b, a); + __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); + __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); + _mm256_store_ps(dp, e); + _mm256_store_ps(dp + 8, f); + } + } + } + ////////////////////////////////////////////////////////////////////////// void avx_irv_vert_step(const lifting_step* s, const line_buf* sig, const line_buf* other, const line_buf* aug, @@ -104,7 +174,7 @@ namespace ojph { float* dph = hdst->f32; float* sp = src->f32; int w = (int)width; - AVX_DEINTERLEAVE(dpl, dph, sp, w, even); + avx_deinterleave32(dpl, dph, sp, w, even); } // the actual horizontal transform @@ -238,7 +308,7 @@ namespace ojph { float* spl = lsrc->f32; float* sph = hsrc->f32; int w = (int)width; - AVX_INTERLEAVE(dp, spl, sph, w, even); + avx_interleave32(dp, spl, sph, w, even); } } else { diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp index 847cd4c4..76a4dd71 100644 --- a/src/core/transform/ojph_transform_avx2.cpp +++ b/src/core/transform/ojph_transform_avx2.cpp @@ -52,13 +52,95 @@ namespace ojph { namespace local { ///////////////////////////////////////////////////////////////////////// - void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, - ui32 repeat, bool synthesis) + // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h + static inline + __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m) + { + // note than m must be obtained using + // __m256i ve = _mm256_set1_epi64x(1ULL << (63 - amt)); + __m256i x = _mm256_srli_epi64(a, amt); + x = _mm256_xor_si256(x, m); + __m256i result = _mm256_sub_epi64(x, m); + return result; + } + + 
////////////////////////////////////////////////////////////////////////// + static inline + void avx2_deinterleave32(float* dpl, float* dph, float* sp, int width) + { + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) + { + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void avx2_interleave32(float* dp, float* spl, float* sph, int width) + { + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) + { + __m256 a = _mm256_load_ps(spl); + __m256 b = _mm256_load_ps(sph); + __m256 c = _mm256_unpacklo_ps(a, b); + __m256 d = _mm256_unpackhi_ps(a, b); + __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); + __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); + _mm256_store_ps(dp, e); + _mm256_store_ps(dp + 8, f); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void avx2_deinterleave64(double* dpl, double* dph, double* sp, int width) + { + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + __m256d a = _mm256_load_pd(sp); + __m256d b = _mm256_load_pd(sp + 4); + __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0)); + __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1)); + __m256d e = _mm256_shuffle_pd(c, d, 0x0); + __m256d f = _mm256_shuffle_pd(c, d, 0xF); + _mm256_store_pd(dpl, e); + _mm256_store_pd(dph, f); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void avx2_interleave64(double* dp, double* spl, double* sph, int width) + { + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) + { + 
__m256d a = _mm256_load_pd(spl); + __m256d b = _mm256_load_pd(sph); + __m256d c = _mm256_unpacklo_pd(a, b); + __m256d d = _mm256_unpackhi_pd(a, b); + __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0)); + __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1)); + _mm256_store_pd(dp, e); + _mm256_store_pd(dp + 4, f); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_vert_step32(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) { const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -181,19 +263,174 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, - ui32 width, bool even) + static + void avx2_rev_vert_step64(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i va = _mm256_set1_epi64x(a); + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1ULL << (63 - e)); + + si64* dst = aug->i64; + const si64* src1 = sig->i64, * src2 = other->i64; + // The general definition of the wavelet in Part 2 is slightly + // different to part 2, although they are mathematically equivalent + // here, we identify the simpler form from Part 1 and employ them + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = 
_mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; 
i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else { // general case + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i u = _mm256_mullo_epi64(va, t); + __m256i v = _mm256_add_epi64(vb, u); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i u = _mm256_mullo_epi64(va, t); + __m256i v = _mm256_add_epi64(vb, u); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + avx2_rev_vert_step32(s, sig, 
other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + avx2_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { // combine both lsrc and hsrc into dst { - float* dpl = ldst->f32; - float* dph = hdst->f32; - float* sp = src->f32; + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; int w = (int)width; - AVX_DEINTERLEAVE(dpl, dph, sp, w, even); + avx2_deinterleave32(dpl, dph, sp, w); } si32* hp = hdst->i32, * lp = ldst->i32; @@ -206,7 +443,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -346,11 +583,201 @@ namespace ojph { hdst->i32[0] = src->i32[0] << 1; } } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // combine both lsrc and hsrc into dst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)src->p; + int w = (int)width; + avx2_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i va = _mm256_set1_epi64x(a); + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1ULL << (63 - e)); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i 
s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else { + // general case + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i u = _mm256_mullo_epi64(va, t); + __m256i v = _mm256_add_epi64(vb, u); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i u = _mm256_mullo_epi64(va, t); + __m256i v = _mm256_add_epi64(vb, u); + __m256i 
w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + avx2_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + avx2_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, - ui32 width, bool even) + static + void avx2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -364,7 +791,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -499,11 +926,11 @@ namespace ojph { // combine both lsrc and hsrc into dst { - float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? 
hsrc->f32 : lsrc->f32; int w = (int)width; - AVX_INTERLEAVE(dp, spl, sph, w, even); + avx2_interleave32(dp, spl, sph, w); } } else { @@ -514,5 +941,194 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i va = _mm256_set1_epi64x(a); + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1ULL << (63 - e)); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } 
+ } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else { + // general case + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + 
__m256i t = _mm256_add_epi64(s1, s2); + __m256i u = _mm256_mullo_epi64(va, t); + __m256i v = _mm256_add_epi64(vb, u); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i u = _mm256_mullo_epi64(va, t); + __m256i v = _mm256_add_epi64(vb, u); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? hsrc->p : lsrc->p); + int w = (int)width; + avx2_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + avx2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + avx2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h 
index c139ca00..5406124c 100644 --- a/src/core/transform/ojph_transform_local.h +++ b/src/core/transform/ojph_transform_local.h @@ -112,7 +112,7 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - #define SSE_DEINTERLEAVE(dpl, dph, sp, width, even) \ + #define SSE_DEINTERLEAVE32(dpl, dph, sp, width, even) \ { \ if (even) \ for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ @@ -134,10 +134,10 @@ namespace ojph { _mm_store_ps(dpl, d); \ _mm_store_ps(dph, c); \ } \ - } + } ////////////////////////////////////////////////////////////////////////// - #define SSE_INTERLEAVE(dp, spl, sph, width, even) \ + #define SSE_INTERLEAVE32(dp, spl, sph, width, even) \ { \ if (even) \ for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ @@ -219,76 +219,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - // Supporting macros - ////////////////////////////////////////////////////////////////////////// - - ////////////////////////////////////////////////////////////////////////// - #define AVX_DEINTERLEAVE(dpl, dph, sp, width, even) \ - { \ - if (even) \ - { \ - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) \ - { \ - __m256 a = _mm256_load_ps(sp); \ - __m256 b = _mm256_load_ps(sp + 8); \ - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); \ - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); \ - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm256_store_ps(dpl, e); \ - _mm256_store_ps(dph, f); \ - } \ - } \ - else \ - { \ - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) \ - { \ - __m256 a = _mm256_load_ps(sp); \ - __m256 b = _mm256_load_ps(sp + 8); \ - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) 
| (0)); \ - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); \ - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm256_store_ps(dpl, f); \ - _mm256_store_ps(dph, e); \ - } \ - } \ - } - - ////////////////////////////////////////////////////////////////////////// - #define AVX_INTERLEAVE(dp, spl, sph, width, even) \ - { \ - if (even) \ - { \ - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) \ - { \ - __m256 a = _mm256_load_ps(spl); \ - __m256 b = _mm256_load_ps(sph); \ - __m256 c = _mm256_unpacklo_ps(a, b); \ - __m256 d = _mm256_unpackhi_ps(a, b); \ - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); \ - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); \ - _mm256_store_ps(dp, e); \ - _mm256_store_ps(dp + 8, f); \ - } \ - } \ - else \ - { \ - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) \ - { \ - __m256 a = _mm256_load_ps(spl); \ - __m256 b = _mm256_load_ps(sph); \ - __m256 c = _mm256_unpacklo_ps(b, a); \ - __m256 d = _mm256_unpackhi_ps(b, a); \ - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); \ - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); \ - _mm256_store_ps(dp, e); \ - _mm256_store_ps(dp + 8, f); \ - } \ - } \ - } - ////////////////////////////////////////////////////////////////////////// // Irreversible functions ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp index 897a1939..e878746d 100644 --- a/src/core/transform/ojph_transform_sse.cpp +++ b/src/core/transform/ojph_transform_sse.cpp @@ -104,7 +104,7 @@ namespace ojph { float* dph = hdst->f32; float* sp = src->f32; int w = (int)width; - SSE_DEINTERLEAVE(dpl, dph, sp, w, even); + SSE_DEINTERLEAVE32(dpl, dph, sp, w, even); } // the actual horizontal transform @@ -238,7 +238,7 @@ namespace ojph { float* spl = lsrc->f32; float* sph = 
hsrc->f32; int w = (int)width; - SSE_INTERLEAVE(dp, spl, sph, w, even); + SSE_INTERLEAVE32(dp, spl, sph, w, even); } } else { diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp index 8328842a..21e0409a 100644 --- a/src/core/transform/ojph_transform_sse2.cpp +++ b/src/core/transform/ojph_transform_sse2.cpp @@ -52,13 +52,80 @@ namespace ojph { namespace local { ///////////////////////////////////////////////////////////////////////// - void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, - ui32 repeat, bool synthesis) + // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h + static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) + { + // note than m must be obtained using + // __m128i ve = _mm_set1_epi64x(1ULL << (63 - amt)); + __m128i x = _mm_srli_epi64(a, amt); + x = _mm_xor_si128(x, m); + __m128i result = _mm_sub_epi64(x, m); + return result; + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void sse2_deinterleave64(double* dpl, double* dph, double* sp, + int width, bool even) + { + if (even) + for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) + { + __m128d a = _mm_load_pd(sp); + __m128d b = _mm_load_pd(sp + 2); + __m128d c = _mm_shuffle_pd(a, b, 0); + __m128d d = _mm_shuffle_pd(a, b, 3); + _mm_store_pd(dpl, c); + _mm_store_pd(dph, d); + } + else + for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) + { + __m128d a = _mm_load_pd(sp); + __m128d b = _mm_load_pd(sp + 2); + __m128d c = _mm_shuffle_pd(a, b, 0); + __m128d d = _mm_shuffle_pd(a, b, 3); + _mm_store_pd(dpl, d); + _mm_store_pd(dph, c); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void sse2_interleave64(double* dp, double* spl, double* sph, + int width, bool even) + { + if (even) + for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) + { + 
__m128d a = _mm_load_pd(spl); + __m128d b = _mm_load_pd(sph); + __m128d c = _mm_unpacklo_pd(a, b); + __m128d d = _mm_unpackhi_pd(a, b); + _mm_store_pd(dp, c); + _mm_store_pd(dp + 2, d); + } + else + for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) + { + __m128d a = _mm_load_pd(spl); + __m128d b = _mm_load_pd(sph); + __m128d c = _mm_unpacklo_pd(b, a); + __m128d d = _mm_unpackhi_pd(b, a); + _mm_store_pd(dp, c); + _mm_store_pd(dp + 2, d); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void sse2_rev_vert_step32(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) { const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m128i vb = _mm_set1_epi32(b); si32* dst = aug->i32; @@ -162,9 +229,143 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, - ui32 width, bool even) + static + void sse2_rev_vert_step64(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1ULL << (63 - e)); + + si64* dst = aug->i64; + const si64* src1 = sig->i64, * src2 = other->i64; + // The general definition of the wavelet in Part 2 is slightly + // different to part 2, although they are mathematically equivalent + // here, we identify the simpler form from Part 1 and employ them + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)src1); + __m128i s2 = _mm_load_si128((__m128i*)src2); + __m128i d 
= _mm_load_si128((__m128i*)dst); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)src1); + __m128i s2 = _mm_load_si128((__m128i*)src2); + __m128i d = _mm_load_si128((__m128i*)dst); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dst, d); + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)src1); + __m128i s2 = _mm_load_si128((__m128i*)src2); + __m128i d = _mm_load_si128((__m128i*)dst); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)src1); + __m128i s2 = _mm_load_si128((__m128i*)src2); + __m128i d = _mm_load_si128((__m128i*)dst); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dst, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)src1); + __m128i s2 = _mm_load_si128((__m128i*)src2); + __m128i d = _mm_load_si128((__m128i*)dst); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + __m128i s1 = 
_mm_load_si128((__m128i*)src1); + __m128i s2 = _mm_load_si128((__m128i*)src2); + __m128i d = _mm_load_si128((__m128i*)dst); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dst, d); + } + } + else { // general case + // 64bit multiplication is not supported in sse2 + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + } + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void sse2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { @@ -174,7 +375,7 @@ namespace ojph { float* dph = hdst->f32; float* sp = src->f32; int w = (int)width; - SSE_DEINTERLEAVE(dpl, dph, sp, w, even); + SSE_DEINTERLEAVE32(dpl, dph, sp, w, even); } si32* hp = hdst->i32, * lp = ldst->i32; @@ -187,7 
+388,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m128i vb = _mm_set1_epi32(b); // extension @@ -284,9 +485,7 @@ namespace ojph { } else { // general case - // 32bit multiplication is not supported in sse2; we need sse4.1, - // where we can use _mm_mullo_epi32, which multiplies - // 32bit x 32bit, keeping the LSBs + // 64bit multiplication is not supported in sse2. if (even) for (ui32 i = h_width; i > 0; --i, sp++, dp++) *dp += (b + a * (sp[0] + sp[1])) >> e; @@ -308,11 +507,181 @@ namespace ojph { hdst->i32[0] = src->i32[0] << 1; } } + + ///////////////////////////////////////////////////////////////////////// + static + void sse2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // combine both lsrc and hsrc into dst + { + double* dpl = (double*)ldst->p; + double* dph = (double*)hdst->p; + double* sp = (double*)src->p; + int w = (int)width; + sse2_deinterleave64(dpl, dph, sp, w, even); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1ULL << (63 - e)); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = 
_mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else { + // general case + // 32bit multiplication is not supported in sse2; we need sse4.1, + // where we can use _mm_mullo_epi32, which multiplies + // 32bit x 32bit, keeping the LSBs + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & 
line_buf::LFT_32BIT)); + sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } ////////////////////////////////////////////////////////////////////////// - void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, - ui32 width, bool even) + void sse2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -326,7 +695,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m128i vb = _mm_set1_epi32(b); // extension @@ -446,7 +815,7 @@ namespace ojph { float* spl = lsrc->f32; float* sph = hsrc->f32; int w = (int)width; - SSE_INTERLEAVE(dp, spl, sph, w, even); + SSE_INTERLEAVE32(dp, spl, sph, w, even); } } else { @@ -457,5 +826,174 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1ULL << (63 - e)); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w 
= sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else { + // general case + // 32bit multiplication is not supported in sse2; we need sse4.1, + // where we can use _mm_mullo_epi32, which multiplies + // 32bit x 32bit, keeping the LSBs + if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)lsrc->p; + double* sph = (double*)hsrc->p; + int w = (int)width; + sse2_interleave64(dp, spl, sph, w, even); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const 
line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph From 1c297b286d2d1e3c22e9b1012815ba69c11854cd Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 5 Nov 2024 18:42:23 +1100 Subject: [PATCH 52/78] A bug fix. --- src/core/transform/ojph_transform_avx2.cpp | 177 ++++++++++++--------- src/core/transform/ojph_transform_sse2.cpp | 10 +- 2 files changed, 105 insertions(+), 82 deletions(-) diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp index 76a4dd71..5f6e6419 100644 --- a/src/core/transform/ojph_transform_avx2.cpp +++ b/src/core/transform/ojph_transform_avx2.cpp @@ -363,33 +363,42 @@ namespace ojph { } } else { // general case - int i = (int)repeat; + // 64bit multiplication is not supported in avx2; + // in particular, _mm256_mullo_epi64. 
if (synthesis) - for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) - { - __m256i s1 = _mm256_load_si256((__m256i*)src1); - __m256i s2 = _mm256_load_si256((__m256i*)src2); - __m256i d = _mm256_load_si256((__m256i*)dst); - __m256i t = _mm256_add_epi64(s1, s2); - __m256i u = _mm256_mullo_epi64(va, t); - __m256i v = _mm256_add_epi64(vb, u); - __m256i w = avx2_mm256_srai_epi64(v, e, ve); - d = _mm256_sub_epi64(d, w); - _mm256_store_si256((__m256i*)dst, d); - } + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; else - for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) - { - __m256i s1 = _mm256_load_si256((__m256i*)src1); - __m256i s2 = _mm256_load_si256((__m256i*)src2); - __m256i d = _mm256_load_si256((__m256i*)dst); - __m256i t = _mm256_add_epi64(s1, s2); - __m256i u = _mm256_mullo_epi64(va, t); - __m256i v = _mm256_add_epi64(vb, u); - __m256i w = avx2_mm256_srai_epi64(v, e, ve); - d = _mm256_add_epi64(d, w); - _mm256_store_si256((__m256i*)dst, d); - } + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + + // int i = (int)repeat; + // if (synthesis) + // for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + // { + // __m256i s1 = _mm256_load_si256((__m256i*)src1); + // __m256i s2 = _mm256_load_si256((__m256i*)src2); + // __m256i d = _mm256_load_si256((__m256i*)dst); + // __m256i t = _mm256_add_epi64(s1, s2); + // __m256i u = _mm256_mullo_epi64(va, t); + // __m256i v = _mm256_add_epi64(vb, u); + // __m256i w = avx2_mm256_srai_epi64(v, e, ve); + // d = _mm256_sub_epi64(d, w); + // _mm256_store_si256((__m256i*)dst, d); + // } + // else + // for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + // { + // __m256i s1 = _mm256_load_si256((__m256i*)src1); + // __m256i s2 = _mm256_load_si256((__m256i*)src2); + // __m256i d = _mm256_load_si256((__m256i*)dst); + // __m256i t = _mm256_add_epi64(s1, s2); + // __m256i u = _mm256_mullo_epi64(va, t); + // __m256i v = _mm256_add_epi64(vb, u); + // __m256i w = 
avx2_mm256_srai_epi64(v, e, ve); + // d = _mm256_add_epi64(d, w); + // _mm256_store_si256((__m256i*)dst, d); + // } } } @@ -710,33 +719,42 @@ namespace ojph { } else { // general case - int i = (int)h_width; + // 64bit multiplication is not supported in avx2; + // in particular, _mm256_mullo_epi64. if (even) - for (; i > 0; i -= 4, sp += 4, dp += 4) - { - __m256i s1 = _mm256_load_si256((__m256i*)sp); - __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); - __m256i d = _mm256_load_si256((__m256i*)dp); - __m256i t = _mm256_add_epi64(s1, s2); - __m256i u = _mm256_mullo_epi64(va, t); - __m256i v = _mm256_add_epi64(vb, u); - __m256i w = avx2_mm256_srai_epi64(v, e, ve); - d = _mm256_add_epi64(d, w); - _mm256_store_si256((__m256i*)dp, d); - } + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; else - for (; i > 0; i -= 4, sp += 4, dp += 4) - { - __m256i s1 = _mm256_load_si256((__m256i*)sp); - __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); - __m256i d = _mm256_load_si256((__m256i*)dp); - __m256i t = _mm256_add_epi64(s1, s2); - __m256i u = _mm256_mullo_epi64(va, t); - __m256i v = _mm256_add_epi64(vb, u); - __m256i w = avx2_mm256_srai_epi64(v, e, ve); - d = _mm256_add_epi64(d, w); - _mm256_store_si256((__m256i*)dp, d); - } + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + + // int i = (int)h_width; + // if (even) + // for (; i > 0; i -= 4, sp += 4, dp += 4) + // { + // __m256i s1 = _mm256_load_si256((__m256i*)sp); + // __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + // __m256i d = _mm256_load_si256((__m256i*)dp); + // __m256i t = _mm256_add_epi64(s1, s2); + // __m256i u = _mm256_mullo_epi64(va, t); + // __m256i v = _mm256_add_epi64(vb, u); + // __m256i w = avx2_mm256_srai_epi64(v, e, ve); + // d = _mm256_add_epi64(d, w); + // _mm256_store_si256((__m256i*)dp, d); + // } + // else + // for (; i > 0; i -= 4, sp += 4, dp += 4) + // { + // __m256i s1 = _mm256_load_si256((__m256i*)sp); + 
// __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + // __m256i d = _mm256_load_si256((__m256i*)dp); + // __m256i t = _mm256_add_epi64(s1, s2); + // __m256i u = _mm256_mullo_epi64(va, t); + // __m256i v = _mm256_add_epi64(vb, u); + // __m256i w = avx2_mm256_srai_epi64(v, e, ve); + // d = _mm256_add_epi64(d, w); + // _mm256_store_si256((__m256i*)dp, d); + // } } // swap buffers @@ -1058,33 +1076,42 @@ namespace ojph { } else { // general case - int i = (int)aug_width; + // 64bit multiplication is not supported in avx2; + // in particular, _mm_mullo_epi64. if (ev) - for (; i > 0; i -= 4, sp += 4, dp += 4) - { - __m256i s1 = _mm256_load_si256((__m256i*)sp); - __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); - __m256i d = _mm256_load_si256((__m256i*)dp); - __m256i t = _mm256_add_epi64(s1, s2); - __m256i u = _mm256_mullo_epi64(va, t); - __m256i v = _mm256_add_epi64(vb, u); - __m256i w = avx2_mm256_srai_epi64(v, e, ve); - d = _mm256_sub_epi64(d, w); - _mm256_store_si256((__m256i*)dp, d); - } + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; else - for (; i > 0; i -= 4, sp += 4, dp += 4) - { - __m256i s1 = _mm256_load_si256((__m256i*)sp); - __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); - __m256i d = _mm256_load_si256((__m256i*)dp); - __m256i t = _mm256_add_epi64(s1, s2); - __m256i u = _mm256_mullo_epi64(va, t); - __m256i v = _mm256_add_epi64(vb, u); - __m256i w = avx2_mm256_srai_epi64(v, e, ve); - d = _mm256_sub_epi64(d, w); - _mm256_store_si256((__m256i*)dp, d); - } + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + + // int i = (int)aug_width; + // if (ev) + // for (; i > 0; i -= 4, sp += 4, dp += 4) + // { + // __m256i s1 = _mm256_load_si256((__m256i*)sp); + // __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + // __m256i d = _mm256_load_si256((__m256i*)dp); + // __m256i t = _mm256_add_epi64(s1, s2); + // __m256i u = _mm256_mullo_epi64(va, t); + // __m256i v = 
_mm256_add_epi64(vb, u); + // __m256i w = avx2_mm256_srai_epi64(v, e, ve); + // d = _mm256_sub_epi64(d, w); + // _mm256_store_si256((__m256i*)dp, d); + // } + // else + // for (; i > 0; i -= 4, sp += 4, dp += 4) + // { + // __m256i s1 = _mm256_load_si256((__m256i*)sp); + // __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + // __m256i d = _mm256_load_si256((__m256i*)dp); + // __m256i t = _mm256_add_epi64(s1, s2); + // __m256i u = _mm256_mullo_epi64(va, t); + // __m256i v = _mm256_add_epi64(vb, u); + // __m256i w = avx2_mm256_srai_epi64(v, e, ve); + // d = _mm256_sub_epi64(d, w); + // _mm256_store_si256((__m256i*)dp, d); + // } } // swap buffers diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp index 21e0409a..f083e8ea 100644 --- a/src/core/transform/ojph_transform_sse2.cpp +++ b/src/core/transform/ojph_transform_sse2.cpp @@ -485,7 +485,7 @@ namespace ojph { } else { // general case - // 64bit multiplication is not supported in sse2. 
+ // 64bit multiplication is not supported in sse2 if (even) for (ui32 i = h_width; i > 0; --i, sp++, dp++) *dp += (b + a * (sp[0] + sp[1])) >> e; @@ -633,9 +633,7 @@ namespace ojph { } else { // general case - // 32bit multiplication is not supported in sse2; we need sse4.1, - // where we can use _mm_mullo_epi32, which multiplies - // 32bit x 32bit, keeping the LSBs + // 64bit multiplication is not supported in sse2 if (even) for (ui32 i = h_width; i > 0; --i, sp++, dp++) *dp += (b + a * (sp[0] + sp[1])) >> e; @@ -941,9 +939,7 @@ namespace ojph { } else { // general case - // 32bit multiplication is not supported in sse2; we need sse4.1, - // where we can use _mm_mullo_epi32, which multiplies - // 32bit x 32bit, keeping the LSBs + // 64bit multiplication is not supported in sse2 if (ev) for (ui32 i = aug_width; i > 0; --i, sp++, dp++) *dp -= (b + a * (sp[-1] + sp[0])) >> e; From c46732fdf6e4a9e391420cce357e48ca58d41e00 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 5 Nov 2024 18:44:52 +1100 Subject: [PATCH 53/78] Removes warnings. 
--- src/core/transform/ojph_transform_avx2.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp index 5f6e6419..0a6ea4dd 100644 --- a/src/core/transform/ojph_transform_avx2.cpp +++ b/src/core/transform/ojph_transform_avx2.cpp @@ -271,7 +271,6 @@ namespace ojph { const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; const ui8 e = s->rev.Eatk; - __m256i va = _mm256_set1_epi64x(a); __m256i vb = _mm256_set1_epi64x(b); __m256i ve = _mm256_set1_epi64x(1ULL << (63 - e)); @@ -621,7 +620,6 @@ namespace ojph { const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; const ui8 e = s->rev.Eatk; - __m256i va = _mm256_set1_epi64x(a); __m256i vb = _mm256_set1_epi64x(b); __m256i ve = _mm256_set1_epi64x(1ULL << (63 - e)); @@ -978,7 +976,6 @@ namespace ojph { const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; const ui8 e = s->rev.Eatk; - __m256i va = _mm256_set1_epi64x(a); __m256i vb = _mm256_set1_epi64x(b); __m256i ve = _mm256_set1_epi64x(1ULL << (63 - e)); From da0e69bb2541c46ac9addf1dca8d4dc6a6a76552 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 5 Nov 2024 19:41:47 +1100 Subject: [PATCH 54/78] Test on Mac-14 --- .github/workflows/ccp-workflow.yml | 2 +- src/core/transform/ojph_colour_sse2.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 3d170b54..53baac3b 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -51,7 +51,7 @@ jobs: fail-fast: false matrix: include: [ - { system: MacOS, runner: macos-latest }, + { system: MacOS, runner: macos-14 }, { system: Ubuntu-latest, runner: ubuntu-latest }, ] name: ${{ matrix.system }} Test diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 3829f6a5..1b5598ab 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp 
@@ -388,6 +388,5 @@ namespace ojph { } } } - } } From 8114a1134bbd2751152a69eedac98e1a3a74765c Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 5 Nov 2024 19:44:05 +1100 Subject: [PATCH 55/78] Testing on mac-13 --- .github/workflows/ccp-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 53baac3b..3ec33304 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -51,7 +51,7 @@ jobs: fail-fast: false matrix: include: [ - { system: MacOS, runner: macos-14 }, + { system: MacOS, runner: macos-13 }, { system: Ubuntu-latest, runner: ubuntu-latest }, ] name: ${{ matrix.system }} Test From c73bfe4527eef1d8fb67ba1c04b58075359d695d Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 5 Nov 2024 20:00:07 +1100 Subject: [PATCH 56/78] reduces reliance on literals --- .github/workflows/ccp-workflow.yml | 2 +- src/core/codestream/ojph_codestream_avx2.cpp | 12 ++++++------ src/core/codestream/ojph_codestream_sse2.cpp | 8 ++++---- src/core/transform/ojph_colour_sse2.cpp | 2 +- src/core/transform/ojph_transform_avx2.cpp | 6 +++--- src/core/transform/ojph_transform_sse2.cpp | 6 +++--- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 3ec33304..e31a0d42 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -51,7 +51,7 @@ jobs: fail-fast: false matrix: include: [ - { system: MacOS, runner: macos-13 }, + { system: MacOS, runner: macos-13, macos-latest }, { system: Ubuntu-latest, runner: ubuntu-latest }, ] name: ${{ matrix.system }} Test diff --git a/src/core/codestream/ojph_codestream_avx2.cpp b/src/core/codestream/ojph_codestream_avx2.cpp index c01e0718..7e2aa29c 100644 --- a/src/core/codestream/ojph_codestream_avx2.cpp +++ b/src/core/codestream/ojph_codestream_avx2.cpp @@ -75,7 +75,7 @@ namespace ojph { // convert 
to sign and magnitude and keep max_val ui32 shift = 31 - K_max; - __m256i m0 = _mm256_set1_epi32((int)0x80000000); + __m256i m0 = _mm256_set1_epi32(INT_MIN); __m256i tmax = _mm256_loadu_si256((__m256i*)max_val); __m256i *p = (__m256i*)sp; for (ui32 i = 0; i < count; i += 8, p += 1, dp += 8) @@ -99,7 +99,7 @@ namespace ojph { //quantize and convert to sign and magnitude and keep max_val __m256 d = _mm256_set1_ps(delta_inv); - __m256i m0 = _mm256_set1_epi32((int)0x80000000); + __m256i m0 = _mm256_set1_epi32(INT_MIN); __m256i tmax = _mm256_loadu_si256((__m256i*)max_val); float *p = (float*)sp; @@ -123,7 +123,7 @@ namespace ojph { { ojph_unused(delta); ui32 shift = 31 - K_max; - __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF); + __m256i m1 = _mm256_set1_epi32(INT_MAX); si32 *p = (si32*)dp; for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8) { @@ -140,7 +140,7 @@ namespace ojph { float delta, ui32 count) { ojph_unused(K_max); - __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF); + __m256i m1 = _mm256_set1_epi32(INT_MAX); __m256 d = _mm256_set1_ps(delta); float *p = (float*)dp; for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8) @@ -163,7 +163,7 @@ namespace ojph { // convert to sign and magnitude and keep max_val ui32 shift = 63 - K_max; - __m256i m0 = _mm256_set1_epi64x(0x8000000000000000LL); + __m256i m0 = _mm256_set1_epi64x(LLONG_MIN); __m256i zero = _mm256_setzero_si256(); __m256i one = _mm256_set1_epi64x(1); __m256i tmax = _mm256_loadu_si256((__m256i*)max_val); @@ -191,7 +191,7 @@ namespace ojph { ojph_unused(delta); ui32 shift = 63 - K_max; - __m256i m1 = _mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL); + __m256i m1 = _mm256_set1_epi64x(LLONG_MAX); __m256i zero = _mm256_setzero_si256(); __m256i one = _mm256_set1_epi64x(1); si64 *p = (si64*)dp; diff --git a/src/core/codestream/ojph_codestream_sse2.cpp b/src/core/codestream/ojph_codestream_sse2.cpp index 738f24b0..e980d774 100644 --- a/src/core/codestream/ojph_codestream_sse2.cpp +++ b/src/core/codestream/ojph_codestream_sse2.cpp @@ 
-138,7 +138,7 @@ namespace ojph { { ojph_unused(delta); ui32 shift = 31 - K_max; - __m128i m1 = _mm_set1_epi32(0x7FFFFFFF); + __m128i m1 = _mm_set1_epi32(INT_MAX); __m128i zero = _mm_setzero_si128(); __m128i one = _mm_set1_epi32(1); si32 *p = (si32*)dp; @@ -160,7 +160,7 @@ namespace ojph { float delta, ui32 count) { ojph_unused(K_max); - __m128i m1 = _mm_set1_epi32(0x7FFFFFFF); + __m128i m1 = _mm_set1_epi32(INT_MAX); __m128 d = _mm_set1_ps(delta); float *p = (float*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) @@ -183,7 +183,7 @@ namespace ojph { // convert to sign and magnitude and keep max_val ui32 shift = 63 - K_max; - __m128i m0 = _mm_set1_epi64x(0x8000000000000000LL); + __m128i m0 = _mm_set1_epi64x(LLONG_MIN); __m128i zero = _mm_setzero_si128(); __m128i one = _mm_set1_epi64x(1); __m128i tmax = _mm_loadu_si128((__m128i*)max_val); @@ -211,7 +211,7 @@ namespace ojph { { ojph_unused(delta); ui32 shift = 63 - K_max; - __m128i m1 = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL); + __m128i m1 = _mm_set1_epi64x(LLONG_MAX); __m128i zero = _mm_setzero_si128(); __m128i one = _mm_set1_epi64x(1); si64 *p = (si64*)dp; diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 1b5598ab..988614ba 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -335,7 +335,7 @@ namespace ojph { (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); - __m128i low_bits = _mm_set_epi64x(0, 0xFFFFFFFFFFFFFFFFLL); + __m128i low_bits = _mm_set_epi64x(0, ULLONG_MAX); const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; for (int i = (repeat + 3) >> 2; i > 0; --i) diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp index 0a6ea4dd..68b67ff2 100644 --- a/src/core/transform/ojph_transform_avx2.cpp +++ b/src/core/transform/ojph_transform_avx2.cpp 
@@ -272,7 +272,7 @@ namespace ojph { const si32 b = s->rev.Batk; const ui8 e = s->rev.Eatk; __m256i vb = _mm256_set1_epi64x(b); - __m256i ve = _mm256_set1_epi64x(1ULL << (63 - e)); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); si64* dst = aug->i64; const si64* src1 = sig->i64, * src2 = other->i64; @@ -621,7 +621,7 @@ namespace ojph { const si32 b = s->rev.Batk; const ui8 e = s->rev.Eatk; __m256i vb = _mm256_set1_epi64x(b); - __m256i ve = _mm256_set1_epi64x(1ULL << (63 - e)); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); // extension lp[-1] = lp[0]; @@ -977,7 +977,7 @@ namespace ojph { const si32 b = s->rev.Batk; const ui8 e = s->rev.Eatk; __m256i vb = _mm256_set1_epi64x(b); - __m256i ve = _mm256_set1_epi64x(1ULL << (63 - e)); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); // extension oth[-1] = oth[0]; diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp index f083e8ea..cafaaa48 100644 --- a/src/core/transform/ojph_transform_sse2.cpp +++ b/src/core/transform/ojph_transform_sse2.cpp @@ -238,7 +238,7 @@ namespace ojph { const si64 b = s->rev.Batk; const ui8 e = s->rev.Eatk; __m128i vb = _mm_set1_epi64x(b); - __m128i ve = _mm_set1_epi64x(1ULL << (63 - e)); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); si64* dst = aug->i64; const si64* src1 = sig->i64, * src2 = other->i64; @@ -537,7 +537,7 @@ namespace ojph { const si32 b = s->rev.Batk; const ui8 e = s->rev.Eatk; __m128i vb = _mm_set1_epi64x(b); - __m128i ve = _mm_set1_epi64x(1ULL << (63 - e)); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); // extension lp[-1] = lp[0]; @@ -843,7 +843,7 @@ namespace ojph { const si32 b = s->rev.Batk; const ui8 e = s->rev.Eatk; __m128i vb = _mm_set1_epi64x(b); - __m128i ve = _mm_set1_epi64x(1ULL << (63 - e)); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); // extension oth[-1] = oth[0]; From 6a4824de53e9cfc036173de2212ecd00fc73c52b Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 5 Nov 2024 20:04:55 +1100 Subject: 
[PATCH 57/78] Adds missing limits.h --- src/core/codestream/ojph_codestream_avx2.cpp | 1 + src/core/codestream/ojph_codestream_sse2.cpp | 3 ++- src/core/transform/ojph_colour_sse2.cpp | 1 + src/core/transform/ojph_transform_avx2.cpp | 1 + src/core/transform/ojph_transform_sse2.cpp | 1 + 5 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/core/codestream/ojph_codestream_avx2.cpp b/src/core/codestream/ojph_codestream_avx2.cpp index 7e2aa29c..a8e5138b 100644 --- a/src/core/codestream/ojph_codestream_avx2.cpp +++ b/src/core/codestream/ojph_codestream_avx2.cpp @@ -35,6 +35,7 @@ // Date: 15 May 2022 //***************************************************************************/ +#include #include #include "ojph_defs.h" diff --git a/src/core/codestream/ojph_codestream_sse2.cpp b/src/core/codestream/ojph_codestream_sse2.cpp index e980d774..3352bcd8 100644 --- a/src/core/codestream/ojph_codestream_sse2.cpp +++ b/src/core/codestream/ojph_codestream_sse2.cpp @@ -35,6 +35,7 @@ // Date: 15 May 2022 //***************************************************************************/ +#include #include #include "ojph_defs.h" @@ -81,7 +82,7 @@ namespace ojph { // convert to sign and magnitude and keep max_val ui32 shift = 31 - K_max; - __m128i m0 = _mm_set1_epi32((int)0x80000000); + __m128i m0 = _mm_set1_epi32(INT_MIN); __m128i zero = _mm_setzero_si128(); __m128i one = _mm_set1_epi32(1); __m128i tmax = _mm_loadu_si128((__m128i*)max_val); diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 988614ba..b27afc02 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -35,6 +35,7 @@ // Date: 11 October 2019 //***************************************************************************/ +#include #include #include "ojph_defs.h" diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp index 68b67ff2..35e652d0 100644 --- 
a/src/core/transform/ojph_transform_avx2.cpp +++ b/src/core/transform/ojph_transform_avx2.cpp @@ -35,6 +35,7 @@ // Date: 28 August 2019 //***************************************************************************/ +#include #include #include "ojph_defs.h" diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp index cafaaa48..742a2e6d 100644 --- a/src/core/transform/ojph_transform_sse2.cpp +++ b/src/core/transform/ojph_transform_sse2.cpp @@ -35,6 +35,7 @@ // Date: 28 August 2019 //***************************************************************************/ +#include #include #include "ojph_defs.h" From ae28cd0feb2c596700a2751dd899b04c7791cf66 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 5 Nov 2024 20:18:37 +1100 Subject: [PATCH 58/78] Let's see without simd. --- .github/workflows/ccp-workflow.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index e31a0d42..03949735 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -51,7 +51,8 @@ jobs: fail-fast: false matrix: include: [ - { system: MacOS, runner: macos-13, macos-latest }, + { system: MacOS-13, runner: macos-13 }, + { system: MacOS-latest, runner: macos-latest }, { system: Ubuntu-latest, runner: ubuntu-latest }, ] name: ${{ matrix.system }} Test @@ -59,7 +60,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: cmake - run: cmake -DOJPH_BUILD_TESTS=yes .. + run: cmake -DOJPH_BUILD_TESTS=yes -DOJPH_DISABLE_SIMD=yes .. working-directory: build - name: build run: make From 3dc2465741c68101dd086c505cc11d305ffae6ea Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 5 Nov 2024 20:22:25 +1100 Subject: [PATCH 59/78] Put back SIMD. 
--- .github/workflows/ccp-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 03949735..94ca3eba 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -60,7 +60,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: cmake - run: cmake -DOJPH_BUILD_TESTS=yes -DOJPH_DISABLE_SIMD=yes .. + run: cmake -DOJPH_BUILD_TESTS=yes .. working-directory: build - name: build run: make From 4339c8fcc0068afa98cfe2705c10dbe204e21833 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 5 Nov 2024 20:34:11 +1100 Subject: [PATCH 60/78] A warning fix. --- src/core/transform/ojph_colour_sse2.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index b27afc02..c6580cf7 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -336,7 +336,7 @@ namespace ojph { (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); - __m128i low_bits = _mm_set_epi64x(0, ULLONG_MAX); + __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX); const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; for (int i = (repeat + 3) >> 2; i > 0; --i) From 9834f171454b1772eb2ed85b98329b4b4793b85a Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Wed, 6 Nov 2024 21:47:34 +1100 Subject: [PATCH 61/78] Incomplete colour_avx2, avx512, and wasm. 
--- src/core/transform/ojph_colour_local.h | 12 +- src/core/transform/ojph_colour_sse2.cpp | 145 +++++++++++++++++---- src/core/transform/ojph_transform_avx2.cpp | 2 +- src/core/transform/ojph_transform_sse2.cpp | 2 +- 4 files changed, 127 insertions(+), 34 deletions(-) diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index 5314c53b..71cf4541 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -229,12 +229,16 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void avx2_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); + void avx2_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void avx2_rct_forward( diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index c6580cf7..a529c66b 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -53,7 +53,7 @@ namespace ojph { static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) { // note than m must be obtained using - // __m128i ve = _mm_set1_epi64x(1ULL << (63 - amt)); + // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt)); __m128i x = _mm_srli_epi64(a, amt); x = _mm_xor_si128(x, m); __m128i result = _mm_sub_epi64(x, m); @@ -63,23 +63,19 @@ namespace ojph { 
////////////////////////////////////////////////////////////////////////// static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero) { - __m128i s, t; - s = _mm_unpacklo_epi32(a, zero); // missing extended -ve + __m128i t; t = _mm_cmplt_epi32(a, zero); // get -ve - t = _mm_unpacklo_epi32(zero, t); - s = _mm_or_si128(t, s); // put -ve - return s; + t = _mm_unpacklo_epi32(a, t); + return t; } ////////////////////////////////////////////////////////////////////////// static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero) { - __m128i s, t; - s = _mm_unpackhi_epi32(a, zero); // missing extended -ve + __m128i t; t = _mm_cmplt_epi32(a, zero); // get -ve - t = _mm_unpackhi_epi32(zero, t); - s = _mm_or_si128(t, s); // put -ve - return s; + t = _mm_unpackhi_epi32(a, t); + return t; } ////////////////////////////////////////////////////////////////////////// @@ -95,16 +91,33 @@ namespace ojph { { const si32 *sp = src_line->i32 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; - si32 s = (si32)shift; - for (ui32 i = width; i > 0; --i) - *dp++ = *sp++ + s; + __m128i sh = _mm_set1_epi32((si32)shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128i s = _mm_loadu_si128((__m128i*)sp); + s = _mm_add_epi32(s, sh); + _mm_storeu_si128((__m128i*)dp, s); + } } else { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; - for (ui32 i = width; i > 0; --i) - *dp++ = *sp++ + shift; + __m128i zero = _mm_setzero_si128(); + __m128i sh = _mm_set1_epi64x(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128i s, t; + s = _mm_loadu_si128((__m128i*)sp); + + t = sse2_cvtlo_epi32_epi64(s, zero); + t = _mm_add_epi64(t, sh); + _mm_storeu_si128((__m128i*)dp, t); + + t = sse2_cvthi_epi32_epi64(s, zero); + t = _mm_add_epi64(t, sh); + _mm_storeu_si128((__m128i*)dp + 1, t); + } } } else @@ -113,8 +126,26 @@ namespace ojph { assert(dst_line->flags | line_buf::LFT_32BIT); const 
si64 *sp = src_line->i64 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; - for (ui32 i = width; i > 0; --i) - *dp++ = (si32)(*sp++ + shift); + __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX); + __m128i sh = _mm_set1_epi64x(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128i s, t; + s = _mm_loadu_si128((__m128i*)sp); + s = _mm_add_epi64(s, sh); + + t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm_and_si128(low_bits, t); + + s = _mm_loadu_si128((__m128i*)sp + 1); + s = _mm_add_epi64(s, sh); + + s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); + s = _mm_andnot_si128(low_bits, s); + + t = _mm_or_si128(s, t); + _mm_storeu_si128((__m128i*)dp, t); + } } } @@ -131,19 +162,49 @@ namespace ojph { { const si32 *sp = src_line->i32 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; - si32 s = (si32)shift; - for (ui32 i = width; i > 0; --i) { - const si32 v = *sp++; - *dp++ = v >= 0 ? v : (- v - s); + __m128i sh = _mm_set1_epi32((si32)(-shift)); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i s = _mm_loadu_si128((__m128i*)sp); + __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value + __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + s = _mm_andnot_si128(c, s); // keep only +ve or 0 + s = _mm_or_si128(s, v_m_sh); // combine + _mm_storeu_si128((__m128i*)dp, s); } } else { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; - for (ui32 i = width; i > 0; --i) { - const si64 v = *sp++; - *dp++ = v >= 0 ? 
v : (- v - shift); + __m128i sh = _mm_set1_epi64x(-shift); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i s, t, u, c, v_m_sh; + s = _mm_loadu_si128((__m128i*)sp); + + t = _mm_cmplt_epi32(s, zero); // find -ve 32bit -1 + u = _mm_unpacklo_epi32(s, t); // correct 64bit data + c = _mm_unpacklo_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + u = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(u, v_m_sh); // combine + + _mm_storeu_si128((__m128i*)dp, u); + u = _mm_unpackhi_epi32(s, t); // correct 64bit data + c = _mm_unpackhi_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + u = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(u, v_m_sh); // combine + + _mm_storeu_si128((__m128i*)dp + 1, u); } } } @@ -153,9 +214,37 @@ namespace ojph { assert(dst_line->flags | line_buf::LFT_32BIT); const si64 *sp = src_line->i64 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; - for (ui32 i = width; i > 0; --i) { - const si64 v = *sp++; - *dp++ = (si32)(v >= 0 ? 
v : (- v - shift)); + __m128i sh = _mm_set1_epi64x(-shift); + __m128i zero = _mm_setzero_si128(); + __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + __m128i s, t, p, n, m, tm; + s = _mm_loadu_si128((__m128i*)sp); + + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value + m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b + tm = _mm_sub_epi64(sh, s); // - shift - value + n = _mm_and_si128(m, tm); // -ve + p = _mm_andnot_si128(m, s); // +ve + tm = _mm_or_si128(n, p); + tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm_and_si128(half_mask, tm); + + s = _mm_loadu_si128((__m128i*)sp + 1); + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value + m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b + tm = _mm_sub_epi64(sh, s); // - shift - value + n = _mm_and_si128(m, tm); // -ve + p = _mm_andnot_si128(m, s); // +ve + tm = _mm_or_si128(n, p); + tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0)); + tm = _mm_andnot_si128(half_mask, tm); + + t = _mm_or_si128(t, tm); + _mm_storeu_si128((__m128i*)dp, t); } } } diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp index 35e652d0..cb7cd61e 100644 --- a/src/core/transform/ojph_transform_avx2.cpp +++ b/src/core/transform/ojph_transform_avx2.cpp @@ -58,7 +58,7 @@ namespace ojph { __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m) { // note than m must be obtained using - // __m256i ve = _mm256_set1_epi64x(1ULL << (63 - amt)); + // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt)); __m256i x = _mm256_srli_epi64(a, amt); x = _mm256_xor_si256(x, m); __m256i result = _mm256_sub_epi64(x, m); diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp index 742a2e6d..1236b7cd 100644 --- 
a/src/core/transform/ojph_transform_sse2.cpp +++ b/src/core/transform/ojph_transform_sse2.cpp @@ -57,7 +57,7 @@ namespace ojph { static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) { // note than m must be obtained using - // __m128i ve = _mm_set1_epi64x(1ULL << (63 - amt)); + // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt)); __m128i x = _mm_srli_epi64(a, amt); x = _mm_xor_si128(x, m); __m128i result = _mm_sub_epi64(x, m); From 75259b820c8fa580736ce48729abc59d16fc5ca8 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Thu, 7 Nov 2024 08:24:48 +1100 Subject: [PATCH 62/78] I should have committed these incomplete files. --- src/core/transform/ojph_colour.cpp | 6 +- src/core/transform/ojph_colour_avx2.cpp | 200 +++++++++++++++++++----- 2 files changed, 166 insertions(+), 40 deletions(-) diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 6289ae13..a4effbf7 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -163,10 +163,10 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - // cnvrt_si32_to_si32_shftd = avx2_cnvrt_si32_to_si32_shftd; - // cnvrt_si32_to_si32_nlt_type3 = avx2_cnvrt_si32_to_si32_nlt_type3; + //rev_convert = avx2_rev_convert; + //rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3; // rct_forward = avx2_rct_forward; - // rct_backward = avx2_rct_backward; + rct_backward = avx2_rct_backward; } #endif // !OJPH_DISABLE_AVX2 diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 14e5a35d..9e550d3f 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -35,10 +35,12 @@ // Date: 11 October 2019 //***************************************************************************/ +#include #include #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include @@ -46,6 +48,20 @@ namespace ojph { 
namespace local { + ///////////////////////////////////////////////////////////////////////// + // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h + static inline + __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m) + { + // note than m must be obtained using + // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt)); + __m256i x = _mm256_srli_epi64(a, amt); + x = _mm256_xor_si256(x, m); + __m256i result = _mm256_sub_epi64(x, m); + return result; + } + + ////////////////////////////////////////////////////////////////////////// void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, ui32 width) @@ -80,50 +96,160 @@ namespace ojph { } } + // + // _mm256_cvtepi32_epi64 + // + + // ////////////////////////////////////////////////////////////////////////// + // void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, + // si32 *y, si32 *cb, si32 *cr, ui32 repeat) + // { + // for (int i = (repeat + 7) >> 3; i > 0; --i) + // { + // __m256i mr = _mm256_load_si256((__m256i*)r); + // __m256i mg = _mm256_load_si256((__m256i*)g); + // __m256i mb = _mm256_load_si256((__m256i*)b); + // __m256i t = _mm256_add_epi32(mr, mb); + // t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); + // _mm256_store_si256((__m256i*)y, _mm256_srai_epi32(t, 2)); + // t = _mm256_sub_epi32(mb, mg); + // _mm256_store_si256((__m256i*)cb, t); + // t = _mm256_sub_epi32(mr, mg); + // _mm256_store_si256((__m256i*)cr, t); + + // r += 8; g += 8; b += 8; + // y += 8; cb += 8; cr += 8; + // } + // } + ////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void avx2_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 7) >> 3; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) 
&& + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m256i mr = _mm256_load_si256((__m256i*)r); - __m256i mg = _mm256_load_si256((__m256i*)g); - __m256i mb = _mm256_load_si256((__m256i*)b); - __m256i t = _mm256_add_epi32(mr, mb); - t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); - _mm256_store_si256((__m256i*)y, _mm256_srai_epi32(t, 2)); - t = _mm256_sub_epi32(mb, mg); - _mm256_store_si256((__m256i*)cb, t); - t = _mm256_sub_epi32(mr, mg); - _mm256_store_si256((__m256i*)cr, t); - - r += 8; g += 8; b += 8; - y += 8; cb += 8; cr += 8; - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i my = _mm256_load_si256((__m256i*)yp); + __m256i mcb = _mm256_load_si256((__m256i*)cbp); + __m256i mcr = _mm256_load_si256((__m256i*)crp); - ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) - { - for (int i = (repeat + 7) >> 3; i > 0; --i) + __m256i t = _mm256_add_epi32(mcb, mcr); + t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); + _mm256_store_si256((__m256i*)gp, t); + __m256i u = _mm256_add_epi32(mcb, t); + _mm256_store_si256((__m256i*)bp, u); + u = _mm256_add_epi32(mcr, t); + _mm256_store_si256((__m256i*)rp, u); + + yp += 8; cbp += 8; crp += 8; + rp += 8; gp += 8; bp += 8; + } + } + else { - __m256i my = _mm256_load_si256((__m256i*)y); - __m256i mcb = _mm256_load_si256((__m256i*)cb); - __m256i mcr = 
_mm256_load_si256((__m256i*)cr); - - __m256i t = _mm256_add_epi32(mcb, mcr); - t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); - _mm256_store_si256((__m256i*)g, t); - __m256i u = _mm256_add_epi32(mcb, t); - _mm256_store_si256((__m256i*)b, u); - u = _mm256_add_epi32(mcr, t); - _mm256_store_si256((__m256i*)r, u); - - y += 8; cb += 8; cr += 8; - r += 8; g += 8; b += 8; + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, 0, (si64)ULLONG_MAX); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i my, mcb, mcr, tr, tg, tb; + my = _mm256_load_si256((__m256i*)yp); + mcb = _mm256_load_si256((__m256i*)cbp); + mcr = _mm256_load_si256((__m256i*)crp); + + tg = _mm256_add_epi64(mcb, mcr); + tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2)); + tb = _mm256_add_epi64(mcb, tg); + tr = _mm256_add_epi64(mcr, tg); + + __m256i mr, mg, mb; + mr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0)); + mr = _mm256_and_si256(low_bits, mr); + mg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0)); + mg = _mm256_and_si256(low_bits, mg); + mb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0)); + mb = _mm256_and_si256(low_bits, mb); + + yp += 4; cbp += 4; crp += 4; + + my = _mm256_load_si256((__m256i*)yp); + mcb = _mm256_load_si256((__m256i*)cbp); + mcr = _mm256_load_si256((__m256i*)crp); + + tg = _mm256_add_epi64(mcb, mcr); + tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2)); + tb = _mm256_add_epi64(mcb, tg); + tr = _mm256_add_epi64(mcr, tg); + + tr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); + tr = _mm256_andnot_si256(low_bits, tr); + mr = _mm256_or_si256(mr, 
tr); + tg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); + tg = _mm256_andnot_si256(low_bits, tg); + mg = _mm256_or_si256(mg, tg); + tb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); + tb = _mm256_andnot_si256(low_bits, tb); + mb = _mm256_or_si256(mb, tb); + + _mm256_store_si256((__m256i*)rp, mr); + _mm256_store_si256((__m256i*)gp, mg); + _mm256_store_si256((__m256i*)bp, mb); + + yp += 4; cbp += 4; crp += 4; + rp += 8; gp += 8; bp += 8; + } } } + + // ////////////////////////////////////////////////////////////////////////// + // void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, + // si32 *r, si32 *g, si32 *b, ui32 repeat) + // { + // for (int i = (repeat + 7) >> 3; i > 0; --i) + // { + // __m256i my = _mm256_load_si256((__m256i*)y); + // __m256i mcb = _mm256_load_si256((__m256i*)cb); + // __m256i mcr = _mm256_load_si256((__m256i*)cr); + + // __m256i t = _mm256_add_epi32(mcb, mcr); + // t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); + // _mm256_store_si256((__m256i*)g, t); + // __m256i u = _mm256_add_epi32(mcb, t); + // _mm256_store_si256((__m256i*)b, u); + // u = _mm256_add_epi32(mcr, t); + // _mm256_store_si256((__m256i*)r, u); + + // y += 8; cb += 8; cr += 8; + // r += 8; g += 8; b += 8; + // } + // } + } } From 9d2a0481e09699417a49e8047623755a5222b89f Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Thu, 7 Nov 2024 10:06:42 +1100 Subject: [PATCH 63/78] splitting decoder into 32/64 to solve macOS failing tests. 
--- src/core/coding/ojph_block_decoder32.cpp | 1616 +++++++++++++++++ ...k_decoder.cpp => ojph_block_decoder64.cpp} | 1045 ----------- 2 files changed, 1616 insertions(+), 1045 deletions(-) create mode 100644 src/core/coding/ojph_block_decoder32.cpp rename src/core/coding/{ojph_block_decoder.cpp => ojph_block_decoder64.cpp} (61%) diff --git a/src/core/coding/ojph_block_decoder32.cpp b/src/core/coding/ojph_block_decoder32.cpp new file mode 100644 index 00000000..f54c77ed --- /dev/null +++ b/src/core/coding/ojph_block_decoder32.cpp @@ -0,0 +1,1616 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2019, The University of New South Wales, Australia +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_block_decoder.cpp +// Author: Aous Naman +// Date: 13 May 2022 +//***************************************************************************/ + +//***************************************************************************/ +/** @file ojph_block_decoder.cpp + * @brief implements a HTJ2K block decoder + */ + +#include +#include + +#include +#include +#include "ojph_block_common.h" +#include "ojph_block_decoder.h" +#include "ojph_arch.h" +#include "ojph_message.h" + +namespace ojph { + namespace local { + + //************************************************************************/ + /** @brief MEL state structure for reading and decoding the MEL bitstream + * + * A number of events is decoded from the MEL bitstream ahead of time + * and stored in run/num_runs. + * Each run represents the number of zero events before a one event. 
+ */ + struct dec_mel_st { + dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false), + k(0), num_runs(0), runs(0) + {} + // data decoding machinery + ui8* data; //!bits > 32) //there are enough bits in the tmp variable + return; // return without reading new data + + ui32 val = 0xFFFFFFFF; // feed in 0xFF if buffer is exhausted + if (melp->size > 4) { // if there is data in the MEL segment + val = *(ui32*)melp->data; // read 32 bits from MEL data + melp->data += 4; // advance pointer + melp->size -= 4; // reduce counter + } + else if (melp->size > 0) + { // 4 or less + int i = 0; + while (melp->size > 1) { + ui32 v = *melp->data++; // read one byte at a time + ui32 m = ~(0xFFu << i); // mask of location + val = (val & m) | (v << i);// put one byte in its correct location + --melp->size; + i += 8; + } + // size equal to 1 + ui32 v = *melp->data++; // the one before the last is different + v |= 0xF; // MEL and VLC segments can overlap + ui32 m = ~(0xFFu << i); + val = (val & m) | (v << i); + --melp->size; + } + + // next we unstuff them before adding them to the buffer + int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if + // the previously read byte requires + // unstuffing + + // data is unstuffed and accumulated in t + // bits has the number of bits in t + ui32 t = val & 0xFF; + bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing + bits -= unstuff; // there is one less bit in t if unstuffing is needed + t = t << (8 - unstuff); // move up to make room for the next byte + + //this is a repeat of the above + t |= (val>>8) & 0xFF; + unstuff = (((val >> 8) & 0xFF) == 0xFF); + bits -= unstuff; + t = t << (8 - unstuff); + + t |= (val>>16) & 0xFF; + unstuff = (((val >> 16) & 0xFF) == 0xFF); + bits -= unstuff; + t = t << (8 - unstuff); + + t |= (val>>24) & 0xFF; + melp->unstuff = (((val >> 24) & 0xFF) == 0xFF); + + // move t to tmp, and push the result all the way up, so we read from + // the MSB + melp->tmp |= ((ui64)t) << 
(64 - bits - melp->bits); + melp->bits += bits; //increment the number of bits in tmp + } + + //************************************************************************/ + /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs + * + * Runs are stored in "runs" and the number of runs in "num_runs". + * Each run represents a number of zero events that may or may not + * terminate in a 1 event. + * Each run is stored in 7 bits. The LSB is 1 if the run terminates in + * a 1 event, 0 otherwise. The next 6 bits, for the case terminating + * with 1, contain the number of consecutive 0 zero events * 2; for the + * case terminating with 0, they store (number of consecutive 0 zero + * events - 1) * 2. + * A total of 6 bits (made up of 1 + 5) should have been enough. + * + * @param [in] melp is a pointer to dec_mel_st structure + */ + static inline + void mel_decode(dec_mel_st *melp) + { + static const int mel_exp[13] = { //MEL exponents + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5 + }; + + if (melp->bits < 6) // if there are less than 6 bits in tmp + mel_read(melp); // then read from the MEL bitstream + // 6 bits is the largest decodable MEL cwd + + //repeat so long that there is enough decodable bits in tmp, + // and the runs store is not full (num_runs < 8) + while (melp->bits >= 6 && melp->num_runs < 8) + { + int eval = mel_exp[melp->k]; // number of bits associated with state + int run = 0; + if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB) + { //one is found + run = 1 << eval; + run--; // consecutive runs of 0 events - 1 + melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12 + melp->tmp <<= 1; // consume one bit from tmp + melp->bits -= 1; + run = run << 1; // a stretch of zeros not terminating in one + } + else + { //0 is found + run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1); + melp->k = melp->k - 1 > 0 ? 
melp->k - 1 : 0; //decrement, min is 0 + melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6) + melp->bits -= eval + 1; + run = (run << 1) + 1; // a stretch of zeros terminating with one + } + eval = melp->num_runs * 7; // 7 bits per run + melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient + melp->runs |= ((ui64)run) << eval; // store the value in runs + melp->num_runs++; // increment count + } + } + + //************************************************************************/ + /** @brief Initiates a dec_mel_st structure for MEL decoding and reads + * some bytes in order to get the read address to a multiple + * of 4 + * + * @param [in] melp is a pointer to dec_mel_st structure + * @param [in] bbuf is a pointer to byte buffer + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup) + { + melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL + melp->bits = 0; // 0 bits in tmp + melp->tmp = 0; // + melp->unstuff = false; // no unstuffing + melp->size = scup - 1; // size is the length of MEL+VLC-1 + melp->k = 0; // 0 for state + melp->num_runs = 0; // num_runs is 0 + melp->runs = 0; // + + //This code is borrowed; original is for a different architecture + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment + int num = 4 - (int)(intptr_t(melp->data) & 0x3); + for (int i = 0; i < num; ++i) { // this code is similar to mel_read + assert(melp->unstuff == false || melp->data[0] <= 0x8F); + ui64 d = (melp->size > 0) ? 
*melp->data : 0xFF;//if buffer is consumed + //set data to 0xFF + if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF + // see the standard + melp->data += melp->size-- > 0; //increment if the end is not reached + int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1 + melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp + melp->bits += d_bits; //increment tmp by number of bits + melp->unstuff = ((d & 0xFF) == 0xFF); //true if next byte needs + //unstuffing + } + melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit + // is the MSB + } + + //************************************************************************/ + /** @brief Retrieves one run from dec_mel_st; if there are no runs stored + * MEL segment is decoded + * + * @param [in] melp is a pointer to dec_mel_st structure + */ + static inline + int mel_get_run(dec_mel_st *melp) + { + if (melp->num_runs == 0) //if no runs, decode more bits from MEL segment + mel_decode(melp); + + int t = melp->runs & 0x7F; //retrieve one run + melp->runs >>= 7; // remove the retrieved run + melp->num_runs--; + return t; // return run + } + + //************************************************************************/ + /** @brief A structure for reading and unstuffing a segment that grows + * backward, such as VLC and MRP + */ + struct rev_struct { + rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false) + {} + //storage + ui8* data; //!bits > 32) // if there are more than 32 bits in tmp, then + return; // reading 32 bits can overflow vlcp->tmp + ui32 val = 0; + //the next line (the if statement) needs to be tested first + if (vlcp->size > 3) // if there are more than 3 bytes left in VLC + { + // (vlcp->data - 3) move pointer back to read 32 bits at once + val = *(ui32*)(vlcp->data - 3); // then read 32 bits + vlcp->data -= 4; // move data pointer back by 4 + vlcp->size -= 4; // reduce available bytes by 4 + } + else if (vlcp->size > 0) + { // 3 or less + int 
i = 24; + while (vlcp->size > 0) { + ui32 v = *vlcp->data--; // read one byte at a time + val |= (v << i); // put byte in its correct location + --vlcp->size; + i -= 8; + } + } + + //accumulate in tmp, number of bits in tmp are stored in bits + ui32 tmp = val >> 24; //start with the MSB byte + ui32 bits; + + // test unstuff (previous byte is >0x8F), and this byte is 0x7F + bits = 8 - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0); + bool unstuff = (val >> 24) > 0x8F; //this is for the next byte + + tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte + bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 16) & 0xFF) > 0x8F; + + tmp |= ((val >> 8) & 0xFF) << bits; + bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 8) & 0xFF) > 0x8F; + + tmp |= (val & 0xFF) << bits; + bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = (val & 0xFF) > 0x8F; + + // now move the read and unstuffed bits into vlcp->tmp + vlcp->tmp |= (ui64)tmp << vlcp->bits; + vlcp->bits += bits; + vlcp->unstuff = unstuff; // this for the next read + } + + //************************************************************************/ + /** @brief Initiates the rev_struct structure and reads a few bytes to + * move the read address to multiple of 4 + * + * There is another similar rev_init_mrp subroutine. The difference is + * that this one, rev_init, discards the first 12 bits (they have the + * sum of the lengths of VLC and MEL segments), and first unstuff depends + * on first 4 bits. 
+ * + * @param [in] vlcp is a pointer to rev_struct structure + * @param [in] data is a pointer to byte at the start of the cleanup pass + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup) + { + //first byte has only the upper 4 bits + vlcp->data = data + lcup - 2; + + //size can not be larger than this, in fact it should be smaller + vlcp->size = scup - 2; + + ui32 d = *vlcp->data--; // read one byte (this is a half byte) + vlcp->tmp = d >> 4; // both initialize and set + vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard + vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream. + // To read 32 bits, read from (vlcp->data - 3) + int num = 1 + (int)(intptr_t(vlcp->data) & 0x3); + int tnum = num < vlcp->size ? num : vlcp->size; + for (int i = 0; i < tnum; ++i) { + ui64 d; + d = *vlcp->data--; // read one byte and move read pointer + //check if the last byte was >0x8F (unstuff == true) and this is 0x7F + ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 
1 : 0); + vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp + vlcp->bits += d_bits; + vlcp->unstuff = d > 0x8F; // for next byte + } + vlcp->size -= tnum; + rev_read(vlcp); // read another 32 bits + } + + //************************************************************************/ + /** @brief Retrieves 32 bits from the head of a rev_struct structure + * + * By the end of this call, vlcp->tmp must have no less than 33 bits + * + * @param [in] vlcp is a pointer to rev_struct structure + */ + static inline + ui32 rev_fetch(rev_struct *vlcp) + { + if (vlcp->bits < 32) // if there are less than 32 bits, read more + { + rev_read(vlcp); // read 32 bits, but unstuffing might reduce this + if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits + rev_read(vlcp); // read another 32 + } + return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp + } + + //************************************************************************/ + /** @brief Consumes num_bits from a rev_struct structure + * + * @param [in] vlcp is a pointer to rev_struct structure + * @param [in] num_bits is the number of bits to be removed + */ + static inline + ui32 rev_advance(rev_struct *vlcp, ui32 num_bits) + { + assert(num_bits <= vlcp->bits); // vlcp->tmp must have at least num_bits + vlcp->tmp >>= num_bits; // remove bits + vlcp->bits -= num_bits; // decrement the number of bits + return (ui32)vlcp->tmp; + } + + //************************************************************************/ + /** @brief Reads and unstuffs from rev_struct + * + * This is different than rev_read in that this fills in zeros when + * the available data is consumed. The other does not care about the + * values when all data is consumed. 
+ * + * See rev_read for more information about unstuffing + * + * @param [in] mrp is a pointer to rev_struct structure + */ + static inline + void rev_read_mrp(rev_struct *mrp) + { + //process 4 bytes at a time + if (mrp->bits > 32) + return; + ui32 val = 0; + if (mrp->size > 3) // If there are more than 3 bytes + { // (mrp->data - 3) move pointer back to read 32 bits at once + val = *(ui32*)(mrp->data - 3); // read 32 bits + mrp->data -= 4; // move back pointer + mrp->size -= 4; // reduce count + } + else if (mrp->size > 0) + { + int i = 24; + while (mrp->size > 0) { + ui32 v = *mrp->data--; // read one byte at a time + val |= (v << i); // put byte in its correct location + --mrp->size; + i -= 8; + } + } + + //accumulate in tmp, and keep count in bits + ui32 bits, tmp = val >> 24; + + //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F + bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0); + bool unstuff = (val >> 24) > 0x8F; + + //process the next byte + tmp |= ((val >> 16) & 0xFF) << bits; + bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 16) & 0xFF) > 0x8F; + + tmp |= ((val >> 8) & 0xFF) << bits; + bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 8) & 0xFF) > 0x8F; + + tmp |= (val & 0xFF) << bits; + bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = (val & 0xFF) > 0x8F; + + mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp->tmp + mrp->bits += bits; + mrp->unstuff = unstuff; // next byte + } + + //************************************************************************/ + /** @brief Initializes rev_struct structure for MRP segment, and reads + * a number of bytes such that the next 32 bits read are from + * an address that is a multiple of 4. Note this is designed for + * an architecture that read size must be compatible with the + * alignment of the read address + * + * There is another similar subroutine rev_init. 
This subroutine does + * NOT skip the first 12 bits, and starts with unstuff set to true. + * + * @param [in] mrp is a pointer to rev_struct structure + * @param [in] data is a pointer to byte at the start of the cleanup pass + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] len2 is the length of SPP+MRP segments + */ + static inline + void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2) + { + mrp->data = data + lcup + len2 - 1; + mrp->size = len2; + mrp->unstuff = true; + mrp->bits = 0; + mrp->tmp = 0; + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream + int num = 1 + (int)(intptr_t(mrp->data) & 0x3); + for (int i = 0; i < num; ++i) { + ui64 d; + //read a byte, 0 if no more data + d = (mrp->size-- > 0) ? *mrp->data-- : 0; + //check if unstuffing is needed + ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 
1 : 0); + mrp->tmp |= d << mrp->bits; // move data to mrp->tmp + mrp->bits += d_bits; + mrp->unstuff = d > 0x8F; // for next byte + } + rev_read_mrp(mrp); + } + + //************************************************************************/ + /** @brief Retrieves 32 bits from the head of a rev_struct structure + * + * By the end of this call, mrp->tmp must have no less than 33 bits + * + * @param [in] mrp is a pointer to rev_struct structure + */ + static inline + ui32 rev_fetch_mrp(rev_struct *mrp) + { + if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp + { + rev_read_mrp(mrp); // read 30-32 bits from mrp + if (mrp->bits < 32) // if there is a space of 32 bits + rev_read_mrp(mrp); // read more + } + return (ui32)mrp->tmp; // return the head of mrp->tmp + } + + //************************************************************************/ + /** @brief Consumes num_bits from a rev_struct structure + * + * @param [in] mrp is a pointer to rev_struct structure + * @param [in] num_bits is the number of bits to be removed + */ + static inline + ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits) + { + assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits + mrp->tmp >>= num_bits; // discard the lowest num_bits bits + mrp->bits -= num_bits; + return (ui32)mrp->tmp; // return data after consumption + } + + //************************************************************************/ + /** @brief State structure for reading and unstuffing of forward-growing + * bitstreams; these are: MagSgn and SPP bitstreams + */ + struct frwd_struct { + const ui8* data; //! + static inline + void frwd_read(frwd_struct *msp) + { + assert(msp->bits <= 32); // assert that there is a space for 32 bits + + ui32 val = 0; + if (msp->size > 3) { + val = *(ui32*)msp->data; // read 32 bits + msp->data += 4; // increment pointer + msp->size -= 4; // reduce size + } + else if (msp->size > 0) + { + int i = 0; + val = X != 0 ? 
0xFFFFFFFFu : 0; + while (msp->size > 0) { + ui32 v = *msp->data++; // read one byte at a time + ui32 m = ~(0xFFu << i); // mask of location + val = (val & m) | (v << i);// put one byte in its correct location + --msp->size; + i += 8; + } + } + else + val = X != 0 ? 0xFFFFFFFFu : 0; + + // we accumulate in t and keep a count of the number of bits in bits + ui32 bits = 8 - msp->unstuff; + ui32 t = val & 0xFF; + bool unstuff = ((val & 0xFF) == 0xFF); // Do we need unstuffing next? + + t |= ((val >> 8) & 0xFF) << bits; + bits += 8 - unstuff; + unstuff = (((val >> 8) & 0xFF) == 0xFF); + + t |= ((val >> 16) & 0xFF) << bits; + bits += 8 - unstuff; + unstuff = (((val >> 16) & 0xFF) == 0xFF); + + t |= ((val >> 24) & 0xFF) << bits; + bits += 8 - unstuff; + msp->unstuff = (((val >> 24) & 0xFF) == 0xFF); // for next byte + + msp->tmp |= ((ui64)t) << msp->bits; // move data to msp->tmp + msp->bits += bits; + } + + //************************************************************************/ + /** @brief Initialize frwd_struct struct and reads some bytes + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + * @param [in] data is a pointer to the start of data + * @param [in] size is the number of byte in the bitstream + */ + template + static inline + void frwd_init(frwd_struct *msp, const ui8* data, int size) + { + msp->data = data; + msp->tmp = 0; + msp->bits = 0; + msp->unstuff = 0; + msp->size = size; + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the bitstream + int num = 4 - (int)(intptr_t(msp->data) & 0x3); + for (int i = 0; i < num; ++i) + { + ui64 d; + //read a byte if the buffer is not exhausted, otherwise set it to X + d = msp->size-- > 0 ? 
*msp->data++ : X; + msp->tmp |= (d << msp->bits); // store data in msp->tmp + msp->bits += 8 - msp->unstuff; // number of bits added to msp->tmp + msp->unstuff = ((d & 0xFF) == 0xFF); // unstuffing for next byte + } + frwd_read(msp); // read 32 bits more + } + + //************************************************************************/ + /** @brief Consume num_bits bits from the bitstream of frwd_struct + * + * @param [in] msp is a pointer to frwd_struct + * @param [in] num_bits is the number of bits to consume + */ + static inline + void frwd_advance(frwd_struct *msp, ui32 num_bits) + { + assert(num_bits <= msp->bits); + msp->tmp >>= num_bits; // consume num_bits + msp->bits -= num_bits; + } + + //************************************************************************/ + /** @brief Fetches 32 bits from the frwd_struct bitstream + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + */ + template + static inline + ui32 frwd_fetch(frwd_struct *msp) + { + if (msp->bits < 32) + { + frwd_read(msp); + if (msp->bits < 32) //need to test + frwd_read(msp); + } + return (ui32)msp->tmp; + } + + //************************************************************************/ + /** @brief Decodes one codeblock, processing the cleanup, significance + * propagation, and magnitude refinement pass + * + * @param [in] coded_data is a pointer to bitstream + * @param [in] decoded_data is a pointer to decoded codeblock data buf. 
+ * @param [in] missing_msbs is the number of missing MSBs + * @param [in] num_passes is the number of passes: 1 if CUP only, + * 2 for CUP+SPP, and 3 for CUP+SPP+MRP + * @param [in] lengths1 is the length of cleanup pass + * @param [in] lengths2 is the length of refinement passes (either SPP + * only or SPP+MRP) + * @param [in] width is the decoded codeblock width + * @param [in] height is the decoded codeblock height + * @param [in] stride is the decoded codeblock buffer stride + * @param [in] stripe_causal is true for stripe causal mode + */ + bool ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, + bool stripe_causal) + { + static bool insufficient_precision = false; + static bool modify_code = false; + static bool truncate_spp_mrp = false; + + if (num_passes > 1 && lengths2 == 0) + { + OJPH_WARN(0x00010001, "A malformed codeblock that has more than " + "one coding pass, but zero length for " + "2nd and potential 3rd pass."); + num_passes = 1; + } + + if (num_passes > 3) + { + OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " + "This codeblocks has %d passes.", + num_passes); + return false; + } + + if (missing_msbs > 30) // p < 0 + { + if (insufficient_precision == false) + { + insufficient_precision = true; + OJPH_WARN(0x00010003, "32 bits are not enough to decode this " + "codeblock. This message will not be " + "displayed again."); + } + return false; + } + else if (missing_msbs == 30) // p == 0 + { // not enough precision to decode and set the bin center to 1 + if (modify_code == false) { + modify_code = true; + OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " + "pass. The code can be modified to support " + "this case. 
This message will not be " + "displayed again."); + } + return false; // 32 bits are not enough to decode this + } + else if (missing_msbs == 29) // if p is 1, then num_passes must be 1 + { + if (num_passes > 1) { + num_passes = 1; + if (truncate_spp_mrp == false) { + truncate_spp_mrp = true; + OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " + "nor MagRef passes; both will be skipped. " + "This message will not be displayed " + "again."); + } + } + } + ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP + // There is a way to handle the case of p == 0, but a different path + // is required + + if (lengths1 < 2) + { + OJPH_WARN(0x00010006, "Wrong codeblock length."); + return false; + } + + // read scup and fix the bytes there + int lcup, scup; + lcup = (int)lengths1; // length of CUP + //scup is the length of MEL + VLC + scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF); + if (scup < 2 || scup > lcup || scup > 4079) //something is wrong + return false; + + // The temporary storage scratch holds two types of data in an + // interleaved fashion. The interleaving allows us to use one + // memory pointer. + // We have one entry for a decoded VLC code, and one entry for UVLC. + // Entries are 16 bits each, corresponding to one quad, + // but since we want to use XMM registers of the SSE family + // of SIMD; we allocated 16 bytes or more per quad row; that is, + // the width is no smaller than 16 bytes (or 8 entries), and the + // height is 512 quads + // Each VLC entry contains, in the following order, starting + // from MSB + // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits) + // Each entry in UVLC contains u_q + // One extra row to handle the case of SPP propagating downwards + // when codeblock width is 4 + ui16 scratch[8 * 513] = {0}; // 8 kB + + // We need an extra two entries (one inf and one u_q) beyond + // the last column. 
+ // If the block width is 4 (2 quads), then we use sstr of 8 + // (enough for 4 quads). If width is 8 (4 quads) we use + // sstr is 16 (enough for 8 quads). For a width of 16 (8 + // quads), we use 24 (enough for 12 quads). + ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8 + + ui32 mmsbp2 = missing_msbs + 2; + + // The cleanup pass is decoded in two steps; in step one, + // the VLC and MEL segments are decoded, generating a record that + // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k. + // This information should be sufficient for the next step. + // In step 2, we decode the MagSgn segment. + + // step 1 decoding VLC and MEL segments + { + // init structures + dec_mel_st mel; + mel_init(&mel, coded_data, lcup, scup); + rev_struct vlc; + rev_init(&vlc, coded_data, lcup, scup); + + int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm + // data represented as runs of 0 events + // See mel_decode description + + ui32 vlc_val; + ui32 c_q = 0; + ui16 *sp = scratch; + //initial quad row + for (ui32 x = 0; x < width; sp += 4) + { + // decode VLC + ///////////// + + // first quad + vlc_val = rev_fetch(&vlc); + + //decode VLC using the context c_q and the head of VLC bitstream + ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ]; + + // if context is zero, use one MEL event + if (c_q == 0) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // Is the run terminated in 1? if so, use decoded VLC code, + // otherwise, discard decoded data, since we will decoded again + // using a different context + t0 = (run == -1) ? t0 : 0; + + // is run -1 or -2? this means a run has been consumed + if (run < 0) + run = mel_get_run(&mel); // get another run + } + //run -= (c_q == 0) ? 2 : 0; + //t0 = (c_q != 0 || run == -1) ? t0 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[0] = t0; + x += 2; + + // prepare context for the next quad; eqn. 
1 in ITU T.814 + c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2); + + //remove data from vlc stream (0 bits are removed if vlc is not used) + vlc_val = rev_advance(&vlc, t0 & 0x7); + + //second quad + ui16 t1 = 0; + + //decode VLC using the context c_q and the head of VLC bitstream + t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)]; + + // if context is zero, use one MEL event + if (c_q == 0 && x < width) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // if event is 0, discard decoded t1 + t1 = (run == -1) ? t1 : 0; + + if (run < 0) // have we consumed all events in a run + run = mel_get_run(&mel); // if yes, then get another run + } + t1 = x < width ? t1 : 0; + //run -= (c_q == 0 && x < width) ? 2 : 0; + //t1 = (c_q != 0 || run == -1) ? t1 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[2] = t1; + x += 2; + + //prepare context for the next quad, eqn. 1 in ITU T.814 + c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2); + + //remove data from vlc stream, if qinf is not used, cwdlen is 0 + vlc_val = rev_advance(&vlc, t1 & 0x7); + + // decode u + ///////////// + // uvlc_mode is made up of u_offset bits from the quad pair + ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4); + if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from + { // the MEL run of events + run -= 2; //subtract 2, since events number if multiplied by 2 + + uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by + // is 0x40 + + if (run < 0)//if run is consumed (run is -1 or -2), get another run + run = mel_get_run(&mel); + } + //run -= (uvlc_mode == 0xc0) ? 2 : 0; + //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 
0x40 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + + //decode uvlc_mode to get u for both quads + ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)]; + //remove total prefix length + vlc_val = rev_advance(&vlc, uvlc_entry & 0x7); + uvlc_entry >>= 3; + //extract suffixes for quad 0 and 1 + ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads + ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads + vlc_val = rev_advance(&vlc, len); + uvlc_entry >>= 4; + // quad 0 length + len = uvlc_entry & 0x7; // quad 0 suffix length + uvlc_entry >>= 3; + ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<> 3) + (tmp >> len)); //kappa == 1 + sp[3]= u_q; + } + sp[0] = sp[1] = 0; + + //non initial quad rows + for (ui32 y = 2; y < height; y += 2) + { + c_q = 0; // context + ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads + + for (ui32 x = 0; x < width; sp += 4) + { + // decode VLC + ///////////// + + // sigma_q (n, ne, nf) + c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2); + c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4); + + // first quad + vlc_val = rev_fetch(&vlc); + + //decode VLC using the context c_q and the head of VLC bitstream + ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ]; + + // if context is zero, use one MEL event + if (c_q == 0) //zero context + { + run -= 2; //subtract 2, since events number is multiplied by 2 + + // Is the run terminated in 1? if so, use decoded VLC code, + // otherwise, discard decoded data, since we will decoded again + // using a different context + t0 = (run == -1) ? t0 : 0; + + // is run -1 or -2? this means a run has been consumed + if (run < 0) + run = mel_get_run(&mel); // get another run + } + //run -= (c_q == 0) ? 2 : 0; + //t0 = (c_q != 0 || run == -1) ? t0 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[0] = t0; + x += 2; + + // prepare context for the next quad; eqn. 
2 in ITU T.814 + // sigma_q (w, sw) + c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1); + // sigma_q (nw) + c_q |= sp[0 - (si32)sstr] & 0x80; + // sigma_q (n, ne, nf) + c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2); + c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4); + + //remove data from vlc stream (0 bits are removed if vlc is unused) + vlc_val = rev_advance(&vlc, t0 & 0x7); + + //second quad + ui16 t1 = 0; + + //decode VLC using the context c_q and the head of VLC bitstream + t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)]; + + // if context is zero, use one MEL event + if (c_q == 0 && x < width) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // if event is 0, discard decoded t1 + t1 = (run == -1) ? t1 : 0; + + if (run < 0) // have we consumed all events in a run + run = mel_get_run(&mel); // if yes, then get another run + } + t1 = x < width ? t1 : 0; + //run -= (c_q == 0 && x < width) ? 2 : 0; + //t1 = (c_q != 0 || run == -1) ? t1 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[2] = t1; + x += 2; + + // partial c_q, will be completed when we process the next quad + // sigma_q (w, sw) + c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1); + // sigma_q (nw) + c_q |= sp[2 - (si32)sstr] & 0x80; + + //remove data from vlc stream, if qinf is not used, cwdlen is 0 + vlc_val = rev_advance(&vlc, t1 & 0x7); + + // decode u + ///////////// + // uvlc_mode is made up of u_offset bits from the quad pair + ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4); + ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)]; + //remove total prefix length + vlc_val = rev_advance(&vlc, uvlc_entry & 0x7); + uvlc_entry >>= 3; + //extract suffixes for quad 0 and 1 + ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads + ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads + vlc_val = rev_advance(&vlc, len); + uvlc_entry >>= 4; + // quad 0 length + len = uvlc_entry & 0x7; // quad 0 suffix length + uvlc_entry >>= 3; + 
ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); + sp[1] = u_q; + u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q + sp[3] = u_q; + } + sp[0] = sp[1] = 0; + } + } + + // step2 we decode magsgn + { + // We allocate a scratch row for storing v_n values. + // We have 512 quads horizontally. + // We need an extra entry to handle the case of vp[1] + // when vp is at the last column. + // Here, we allocate 4 instead of 1 to make the buffer size + // a multipled of 16 bytes. + const int v_n_size = 512 + 4; + ui32 v_n_scratch[v_n_size] = {0}; // 2+ kB + + frwd_struct magsgn; + frwd_init<0xFF>(&magsgn, coded_data, lcup - scup); + + ui16 *sp = scratch; + ui32 *vp = v_n_scratch; + ui32 *dp = decoded_data; + + ui32 prev_v_n = 0; + for (ui32 x = 0; x < width; sp += 2, ++vp) + { + ui32 inf = sp[0]; + ui32 U_q = sp[1]; + if (U_q > mmsbp2) + return false; + + ui32 v_n; + ui32 val = 0; + ui32 bit = 0; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui32 ms_val = frwd_fetch<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui32 ms_val = frwd_fetch<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 
2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= width) + { ++vp; break; } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui32 ms_val = frwd_fetch<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui32 ms_val = frwd_fetch<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + + for (ui32 y = 2; y < height; y += 2) + { + ui16 *sp = scratch + (y >> 1) * sstr; + ui32 *vp = v_n_scratch; + ui32 *dp = decoded_data + y * stride; + + prev_v_n = 0; + for (ui32 x = 0; x < width; sp += 2, ++vp) + { + ui32 inf = sp[0]; + ui32 u_q = sp[1]; + + ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1? + ui32 emax = vp[0] | vp[1]; + emax = 31 - count_leading_zeros(emax | 2); // emax - 1 + ui32 kappa = gamma ? 
emax : 1; + + ui32 U_q = u_q + kappa; + if (U_q > mmsbp2) + return false; + + ui32 v_n; + ui32 val = 0; + ui32 bit = 0; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui32 ms_val = frwd_fetch<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui32 ms_val = frwd_fetch<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= width) + { ++vp; break; } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui32 ms_val = frwd_fetch<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + 
v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui32 ms_val = frwd_fetch<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 31; // get sign bit + v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits + v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + } + } + + if (num_passes > 1) + { + // We use scratch again, we can divide it into multiple regions + // sigma holds all the significant samples, and it cannot + // be modified after it is set. it will be used during the + // Magnitude Refinement Pass + ui16* const sigma = scratch; + + ui32 mstr = (width + 3u) >> 2; // divide by 4, since each + // ui16 contains 4 columns + mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8 + + // We re-arrange quad significance, where each 4 consecutive + // bits represent one quad, into column significance, where, + // each 4 consequtive bits represent one column of 4 rows + { + ui32 y; + for (y = 0; y < height; y += 4) + { + ui16* sp = scratch + (y >> 1) * sstr; + ui16* dp = sigma + (y >> 2) * mstr; + for (ui32 x = 0; x < width; x += 4, sp += 4, ++dp) { + ui32 t0 = 0, t1 = 0; + t0 = ((sp[0 ] & 0x30u) >> 4) | ((sp[0 ] & 0xC0u) >> 2); + t0 |= ((sp[2 ] & 0x30u) << 4) | ((sp[2 ] & 0xC0u) << 6); + t1 = ((sp[0+sstr] & 0x30u) >> 2) | ((sp[0+sstr] & 0xC0u) ); + t1 |= ((sp[2+sstr] & 0x30u) << 6) | ((sp[2+sstr] & 0xC0u) << 8); + dp[0] = (ui16)(t0 | t1); + } + dp[0] = 0; // set an extra entry on the right with 0 + } + { + // reset one row after the codeblock + ui16* dp = sigma + (y >> 2) * mstr; + for (ui32 x = 0; x < width; x += 4, ++dp) + dp[0] = 0; + dp[0] = 0; // set an extra 
entry on the right with 0 + } + } + + // We perform Significance Propagation Pass here + { + // This stores significance information of the previous + // 4 rows. Significance information in this array includes + // all signicant samples in bitplane p - 1; that is, + // significant samples for bitplane p (discovered during the + // cleanup pass and stored in sigma) and samples that have recently + // became significant (during the SPP) in bitplane p-1. + // We store enough for the widest row, containing 1024 columns, + // which is equivalent to 256 of ui16, since each stores 4 columns. + // We add an extra 8 entries, just in case we need more + ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes + + frwd_struct sigprop; + frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2); + + for (ui32 y = 0; y < height; y += 4) + { + ui32 pattern = 0xFFFFu; // a pattern needed samples + if (height - y < 4) { + pattern = 0x7777u; + if (height - y < 3) { + pattern = 0x3333u; + if (height - y < 2) + pattern = 0x1111u; + } + } + + // prev holds sign. info. for the previous quad, together + // with the rows on top of it and below it. + ui32 prev = 0; + ui16 *prev_sig = prev_row_sig; + ui16 *cur_sig = sigma + (y >> 2) * mstr; + ui32 *dpp = decoded_data + y * stride; + for (ui32 x = 0; x < width; x += 4, ++cur_sig, ++prev_sig) + { + // only rows and columns inside the stripe are included + si32 s = (si32)x + 4 - (si32)width; + s = ojph_max(s, 0); + pattern = pattern >> (s * 4); + + // We first find locations that need to be tested (potential + // SPP members); these location will end up in mbr + // In each iteration, we produce 16 bits because cwd can have + // up to 16 bits of significance information, followed by the + // corresponding 16 bits of sign information; therefore, it is + // sufficient to fetch 32 bit data per loop. + + // Althougth we are interested in 16 bits only, we load 32 bits. 
+ // For the 16 bits we are producing, we need the next 4 bits -- + // We need data for at least 5 columns out of 8. + // Therefore loading 32 bits is easier than loading 16 bits + // twice. + ui32 ps = *(ui32*)prev_sig; + ui32 ns = *(ui32*)(cur_sig + mstr); + ui32 u = (ps & 0x88888888) >> 3; // the row on top + if (!stripe_causal) + u |= (ns & 0x11111111) << 3; // the row below + + ui32 cs = *(ui32*)cur_sig; + // vertical integration + ui32 mbr = cs; // this sig. info. + mbr |= (cs & 0x77777777) << 1; //above neighbors + mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors + mbr |= u; + // horizontal integration + ui32 t = mbr; + mbr |= t << 4; // neighbors on the left + mbr |= t >> 4; // neighbors on the right + mbr |= prev >> 12; // significance of previous group + + // remove outside samples, and already significant samples + mbr &= pattern; + mbr &= ~cs; + + // find samples that become significant during the SPP + ui32 new_sig = mbr; + if (new_sig) + { + ui32 cwd = frwd_fetch<0>(&sigprop); + + ui32 cnt = 0; + ui32 col_mask = 0xFu; + ui32 inv_sig = ~cs & pattern; + for (int i = 0; i < 16; i += 4, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan one column + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x33u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x76u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xECu << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xC8u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + } + + if (new_sig) + { + // new_sig has newly-discovered sig. 
samples during SPP + // find the signs and update decoded_data + ui32 *dp = dpp + x; + ui32 val = 3u << (p - 2); + col_mask = 0xFu; + for (int i = 0; i < 4; ++i, ++dp, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan 4 signs + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + assert(dp[0] == 0); + dp[0] = (cwd << 31) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[stride] == 0); + dp[stride] = (cwd << 31) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[2 * stride] == 0); + dp[2 * stride] = (cwd << 31) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[3 * stride] == 0); + dp[3 * stride] = (cwd << 31) | val; + cwd >>= 1; ++cnt; + } + } + } + frwd_advance(&sigprop, cnt); + } + + new_sig |= cs; + *prev_sig = (ui16)(new_sig); + + // vertical integration for the new sig. info. + t = new_sig; + new_sig |= (t & 0x7777) << 1; //above neighbors + new_sig |= (t & 0xEEEE) >> 1; //below neighbors + // add sig. info. 
from the row on top and below + prev = new_sig | u; + // we need only the bits in 0xF000 + prev &= 0xF000; + } + } + } + + // We perform Magnitude Refinement Pass here + if (num_passes > 2) + { + rev_struct magref; + rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2); + + for (ui32 y = 0; y < height; y += 4) + { + ui32 *cur_sig = (ui32*)(sigma + (y >> 2) * mstr); + ui32 *dpp = decoded_data + y * stride; + ui32 half = 1 << (p - 2); + for (ui32 i = 0; i < width; i += 8) + { + //Process one entry from sigma array at a time + // Each nibble (4 bits) in the sigma array represents 4 rows, + // and the 32 bits contain 8 columns + ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data + ui32 sig = *cur_sig++; // 32 bit that will be processed now + ui32 col_mask = 0xFu; // a mask for a column in sig + if (sig) // if any of the 32 bits are set + { + for (int j = 0; j < 8; ++j) //one column at a time + { + if (sig & col_mask) // lowest nibble + { + ui32 *dp = dpp + i + j; // next column in decoded samples + ui32 sample_mask = 0x11111111u & col_mask; //LSB + + for (int k = 0; k < 4; ++k) { + if (sig & sample_mask) //if LSB is set + { + assert(dp[0] != 0); // decoded value cannot be zero + assert((dp[0] & half) == 0); // no half + ui32 sym = cwd & 1; // get it value + sym = (1 - sym) << (p - 1); // previous center of bin + sym |= half; // put half the center of bin + dp[0] ^= sym; // remove old bin center and put new + cwd >>= 1; // consume word + } + sample_mask += sample_mask; //next row + dp += stride; // next samples row + } + } + col_mask <<= 4; //next column + } + } + // consume data according to the number of bits set + rev_advance_mrp(&magref, population_count(sig)); + } + } + } + } + return true; + } + } +} \ No newline at end of file diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder64.cpp similarity index 61% rename from src/core/coding/ojph_block_decoder.cpp rename to src/core/coding/ojph_block_decoder64.cpp index 
3ee6ca26..88017356 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder64.cpp @@ -284,145 +284,6 @@ namespace ojph { //!bits > 32) // if there are more than 32 bits in tmp, then - return; // reading 32 bits can overflow vlcp->tmp - ui32 val = 0; - //the next line (the if statement) needs to be tested first - if (vlcp->size > 3) // if there are more than 3 bytes left in VLC - { - // (vlcp->data - 3) move pointer back to read 32 bits at once - val = *(ui32*)(vlcp->data - 3); // then read 32 bits - vlcp->data -= 4; // move data pointer back by 4 - vlcp->size -= 4; // reduce available byte by 4 - } - else if (vlcp->size > 0) - { // 4 or less - int i = 24; - while (vlcp->size > 0) { - ui32 v = *vlcp->data--; // read one byte at a time - val |= (v << i); // put byte in its correct location - --vlcp->size; - i -= 8; - } - } - - //accumulate in tmp, number of bits in tmp are stored in bits - ui32 tmp = val >> 24; //start with the MSB byte - ui32 bits; - - // test unstuff (previous byte is >0x8F), and this byte is 0x7F - bits = 8 - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0); - bool unstuff = (val >> 24) > 0x8F; //this is for the next byte - - tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte - bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0); - unstuff = ((val >> 16) & 0xFF) > 0x8F; - - tmp |= ((val >> 8) & 0xFF) << bits; - bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0); - unstuff = ((val >> 8) & 0xFF) > 0x8F; - - tmp |= (val & 0xFF) << bits; - bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 
1 : 0); - unstuff = (val & 0xFF) > 0x8F; - - // now move the read and unstuffed bits into vlcp->tmp - vlcp->tmp |= (ui64)tmp << vlcp->bits; - vlcp->bits += bits; - vlcp->unstuff = unstuff; // this for the next read - } - - //************************************************************************/ - /** @brief Initiates the rev_struct structure and reads a few bytes to - * move the read address to multiple of 4 - * - * There is another similar rev_init_mrp subroutine. The difference is - * that this one, rev_init, discards the first 12 bits (they have the - * sum of the lengths of VLC and MEL segments), and first unstuff depends - * on first 4 bits. - * - * @param [in] vlcp is a pointer to rev_struct structure - * @param [in] data is a pointer to byte at the start of the cleanup pass - * @param [in] lcup is the length of MagSgn+MEL+VLC segments - * @param [in] scup is the length of MEL+VLC segments - */ - static inline - void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup) - { - //first byte has only the upper 4 bits - vlcp->data = data + lcup - 2; - - //size can not be larger than this, in fact it should be smaller - vlcp->size = scup - 2; - - ui32 d = *vlcp->data--; // read one byte (this is a half byte) - vlcp->tmp = d >> 4; // both initialize and set - vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard - vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte - - //This code is designed for an architecture that read address should - // align to the read size (address multiple of 4 if read size is 4) - //These few lines take care of the case where data is not at a multiple - // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream. - // To read 32 bits, read from (vlcp->data - 3) - int num = 1 + (int)(intptr_t(vlcp->data) & 0x3); - int tnum = num < vlcp->size ? 
num : vlcp->size; - for (int i = 0; i < tnum; ++i) { - ui64 d; - d = *vlcp->data--; // read one byte and move read pointer - //check if the last byte was >0x8F (unstuff == true) and this is 0x7F - ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0); - vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp - vlcp->bits += d_bits; - vlcp->unstuff = d > 0x8F; // for next byte - } - vlcp->size -= tnum; - rev_read(vlcp); // read another 32 buts - } - - //************************************************************************/ - /** @brief Retrieves 32 bits from the head of a rev_struct structure - * - * By the end of this call, vlcp->tmp must have no less than 33 bits - * - * @param [in] vlcp is a pointer to rev_struct structure - */ - static inline - ui32 rev_fetch(rev_struct *vlcp) - { - if (vlcp->bits < 32) // if there are less then 32 bits, read more - { - rev_read(vlcp); // read 32 bits, but unstuffing might reduce this - if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits - rev_read(vlcp); // read another 32 - } - return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp - } - //************************************************************************/ /** @brief Read and unstuff data from a backwardly-growing segment * @@ -514,21 +375,6 @@ namespace ojph { return vlcp->tmp; // return unstuff decoded bits } - //************************************************************************/ - /** @brief Consumes num_bits from a rev_struct structure - * - * @param [in] vlcp is a pointer to rev_struct structure - * @param [in] num_bits is the number of bits to be removed - */ - static inline - ui32 rev_advance(rev_struct *vlcp, ui32 num_bits) - { - assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits - vlcp->tmp >>= num_bits; // remove bits - vlcp->bits -= num_bits; // decrement the number of bits - return (ui32)vlcp->tmp; - } - //************************************************************************/ 
/** @brief Consumes num_bits from a rev_struct structure * @@ -900,897 +746,6 @@ namespace ojph { return msp->tmp; } - //************************************************************************/ - /** @brief Decodes one codeblock, processing the cleanup, siginificance - * propagation, and magnitude refinement pass - * - * @param [in] coded_data is a pointer to bitstream - * @param [in] decoded_data is a pointer to decoded codeblock data buf. - * @param [in] missing_msbs is the number of missing MSBs - * @param [in] num_passes is the number of passes: 1 if CUP only, - * 2 for CUP+SPP, and 3 for CUP+SPP+MRP - * @param [in] lengths1 is the length of cleanup pass - * @param [in] lengths2 is the length of refinement passes (either SPP - * only or SPP+MRP) - * @param [in] width is the decoded codeblock width - * @param [in] height is the decoded codeblock height - * @param [in] stride is the decoded codeblock buffer stride - * @param [in] stripe_causal is true for stripe causal mode - */ - bool ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, - ui32 missing_msbs, ui32 num_passes, - ui32 lengths1, ui32 lengths2, - ui32 width, ui32 height, ui32 stride, - bool stripe_causal) - { - static bool insufficient_precision = false; - static bool modify_code = false; - static bool truncate_spp_mrp = false; - - if (num_passes > 1 && lengths2 == 0) - { - OJPH_WARN(0x00010001, "A malformed codeblock that has more than " - "one coding pass, but zero length for " - "2nd and potential 3rd pass."); - num_passes = 1; - } - - if (num_passes > 3) - { - OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " - "This codeblocks has %d passes.", - num_passes); - return false; - } - - if (missing_msbs > 30) // p < 0 - { - if (insufficient_precision == false) - { - insufficient_precision = true; - OJPH_WARN(0x00010003, "32 bits are not enough to decode this " - "codeblock. 
This message will not be " - "displayed again."); - } - return false; - } - else if (missing_msbs == 30) // p == 0 - { // not enough precision to decode and set the bin center to 1 - if (modify_code == false) { - modify_code = true; - OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " - "pass. The code can be modified to support " - "this case. This message will not be " - "displayed again."); - } - return false; // 32 bits are not enough to decode this - } - else if (missing_msbs == 29) // if p is 1, then num_passes must be 1 - { - if (num_passes > 1) { - num_passes = 1; - if (truncate_spp_mrp == false) { - truncate_spp_mrp = true; - OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " - "nor MagRef passes; both will be skipped. " - "This message will not be displayed " - "again."); - } - } - } - ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP - // There is a way to handle the case of p == 0, but a different path - // is required - - if (lengths1 < 2) - { - OJPH_WARN(0x00010006, "Wrong codeblock length."); - return false; - } - - // read scup and fix the bytes there - int lcup, scup; - lcup = (int)lengths1; // length of CUP - //scup is the length of MEL + VLC - scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF); - if (scup < 2 || scup > lcup || scup > 4079) //something is wrong - return false; - - // The temporary storage scratch holds two types of data in an - // interleaved fashion. The interleaving allows us to use one - // memory pointer. - // We have one entry for a decoded VLC code, and one entry for UVLC. 
- // Entries are 16 bits each, corresponding to one quad, - // but since we want to use XMM registers of the SSE family - // of SIMD; we allocated 16 bytes or more per quad row; that is, - // the width is no smaller than 16 bytes (or 8 entries), and the - // height is 512 quads - // Each VLC entry contains, in the following order, starting - // from MSB - // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits) - // Each entry in UVLC contains u_q - // One extra row to handle the case of SPP propagating downwards - // when codeblock width is 4 - ui16 scratch[8 * 513] = {0}; // 8 kB - - // We need an extra two entries (one inf and one u_q) beyond - // the last column. - // If the block width is 4 (2 quads), then we use sstr of 8 - // (enough for 4 quads). If width is 8 (4 quads) we use - // sstr is 16 (enough for 8 quads). For a width of 16 (8 - // quads), we use 24 (enough for 12 quads). - ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8 - - ui32 mmsbp2 = missing_msbs + 2; - - // The cleanup pass is decoded in two steps; in step one, - // the VLC and MEL segments are decoded, generating a record that - // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k. - // This information should be sufficient for the next step. - // In step 2, we decode the MagSgn segment. 
- - // step 1 decoding VLC and MEL segments - { - // init structures - dec_mel_st mel; - mel_init(&mel, coded_data, lcup, scup); - rev_struct vlc; - rev_init(&vlc, coded_data, lcup, scup); - - int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm - // data represented as runs of 0 events - // See mel_decode description - - ui32 vlc_val; - ui32 c_q = 0; - ui16 *sp = scratch; - //initial quad row - for (ui32 x = 0; x < width; sp += 4) - { - // decode VLC - ///////////// - - // first quad - vlc_val = rev_fetch(&vlc); - - //decode VLC using the context c_q and the head of VLC bitstream - ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ]; - - // if context is zero, use one MEL event - if (c_q == 0) //zero context - { - run -= 2; //subtract 2, since events number if multiplied by 2 - - // Is the run terminated in 1? if so, use decoded VLC code, - // otherwise, discard decoded data, since we will decoded again - // using a different context - t0 = (run == -1) ? t0 : 0; - - // is run -1 or -2? this means a run has been consumed - if (run < 0) - run = mel_get_run(&mel); // get another run - } - //run -= (c_q == 0) ? 2 : 0; - //t0 = (c_q != 0 || run == -1) ? t0 : 0; - //if (run < 0) - // run = mel_get_run(&mel); // get another run - sp[0] = t0; - x += 2; - - // prepare context for the next quad; eqn. 1 in ITU T.814 - c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2); - - //remove data from vlc stream (0 bits are removed if vlc is not used) - vlc_val = rev_advance(&vlc, t0 & 0x7); - - //second quad - ui16 t1 = 0; - - //decode VLC using the context c_q and the head of VLC bitstream - t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)]; - - // if context is zero, use one MEL event - if (c_q == 0 && x < width) //zero context - { - run -= 2; //subtract 2, since events number if multiplied by 2 - - // if event is 0, discard decoded t1 - t1 = (run == -1) ? 
t1 : 0; - - if (run < 0) // have we consumed all events in a run - run = mel_get_run(&mel); // if yes, then get another run - } - t1 = x < width ? t1 : 0; - //run -= (c_q == 0 && x < width) ? 2 : 0; - //t1 = (c_q != 0 || run == -1) ? t1 : 0; - //if (run < 0) - // run = mel_get_run(&mel); // get another run - sp[2] = t1; - x += 2; - - //prepare context for the next quad, eqn. 1 in ITU T.814 - c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2); - - //remove data from vlc stream, if qinf is not used, cwdlen is 0 - vlc_val = rev_advance(&vlc, t1 & 0x7); - - // decode u - ///////////// - // uvlc_mode is made up of u_offset bits from the quad pair - ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4); - if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from - { // the MEL run of events - run -= 2; //subtract 2, since events number if multiplied by 2 - - uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by - // is 0x40 - - if (run < 0)//if run is consumed (run is -1 or -2), get another run - run = mel_get_run(&mel); - } - //run -= (uvlc_mode == 0xc0) ? 2 : 0; - //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 
0x40 : 0; - //if (run < 0) - // run = mel_get_run(&mel); // get another run - - //decode uvlc_mode to get u for both quads - ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)]; - //remove total prefix length - vlc_val = rev_advance(&vlc, uvlc_entry & 0x7); - uvlc_entry >>= 3; - //extract suffixes for quad 0 and 1 - ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads - ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads - vlc_val = rev_advance(&vlc, len); - uvlc_entry >>= 4; - // quad 0 length - len = uvlc_entry & 0x7; // quad 0 suffix length - uvlc_entry >>= 3; - ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<> 3) + (tmp >> len)); //kappa == 1 - sp[3]= u_q; - } - sp[0] = sp[1] = 0; - - //non initial quad rows - for (ui32 y = 2; y < height; y += 2) - { - c_q = 0; // context - ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads - - for (ui32 x = 0; x < width; sp += 4) - { - // decode VLC - ///////////// - - // sigma_q (n, ne, nf) - c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2); - c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4); - - // first quad - vlc_val = rev_fetch(&vlc); - - //decode VLC using the context c_q and the head of VLC bitstream - ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ]; - - // if context is zero, use one MEL event - if (c_q == 0) //zero context - { - run -= 2; //subtract 2, since events number is multiplied by 2 - - // Is the run terminated in 1? if so, use decoded VLC code, - // otherwise, discard decoded data, since we will decoded again - // using a different context - t0 = (run == -1) ? t0 : 0; - - // is run -1 or -2? this means a run has been consumed - if (run < 0) - run = mel_get_run(&mel); // get another run - } - //run -= (c_q == 0) ? 2 : 0; - //t0 = (c_q != 0 || run == -1) ? t0 : 0; - //if (run < 0) - // run = mel_get_run(&mel); // get another run - sp[0] = t0; - x += 2; - - // prepare context for the next quad; eqn. 
2 in ITU T.814 - // sigma_q (w, sw) - c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1); - // sigma_q (nw) - c_q |= sp[0 - (si32)sstr] & 0x80; - // sigma_q (n, ne, nf) - c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2); - c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4); - - //remove data from vlc stream (0 bits are removed if vlc is unused) - vlc_val = rev_advance(&vlc, t0 & 0x7); - - //second quad - ui16 t1 = 0; - - //decode VLC using the context c_q and the head of VLC bitstream - t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)]; - - // if context is zero, use one MEL event - if (c_q == 0 && x < width) //zero context - { - run -= 2; //subtract 2, since events number if multiplied by 2 - - // if event is 0, discard decoded t1 - t1 = (run == -1) ? t1 : 0; - - if (run < 0) // have we consumed all events in a run - run = mel_get_run(&mel); // if yes, then get another run - } - t1 = x < width ? t1 : 0; - //run -= (c_q == 0 && x < width) ? 2 : 0; - //t1 = (c_q != 0 || run == -1) ? t1 : 0; - //if (run < 0) - // run = mel_get_run(&mel); // get another run - sp[2] = t1; - x += 2; - - // partial c_q, will be completed when we process the next quad - // sigma_q (w, sw) - c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1); - // sigma_q (nw) - c_q |= sp[2 - (si32)sstr] & 0x80; - - //remove data from vlc stream, if qinf is not used, cwdlen is 0 - vlc_val = rev_advance(&vlc, t1 & 0x7); - - // decode u - ///////////// - // uvlc_mode is made up of u_offset bits from the quad pair - ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4); - ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)]; - //remove total prefix length - vlc_val = rev_advance(&vlc, uvlc_entry & 0x7); - uvlc_entry >>= 3; - //extract suffixes for quad 0 and 1 - ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads - ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads - vlc_val = rev_advance(&vlc, len); - uvlc_entry >>= 4; - // quad 0 length - len = uvlc_entry & 0x7; // quad 0 suffix length - uvlc_entry >>= 3; - 
ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len))); - sp[1] = u_q; - u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q - sp[3] = u_q; - } - sp[0] = sp[1] = 0; - } - } - - // step2 we decode magsgn - { - // We allocate a scratch row for storing v_n values. - // We have 512 quads horizontally. - // We need an extra entry to handle the case of vp[1] - // when vp is at the last column. - // Here, we allocate 4 instead of 1 to make the buffer size - // a multipled of 16 bytes. - const int v_n_size = 512 + 4; - ui32 v_n_scratch[v_n_size] = {0}; // 2+ kB - - frwd_struct magsgn; - frwd_init<0xFF>(&magsgn, coded_data, lcup - scup); - - ui16 *sp = scratch; - ui32 *vp = v_n_scratch; - ui32 *dp = decoded_data; - - ui32 prev_v_n = 0; - for (ui32 x = 0; x < width; sp += 2, ++vp) - { - ui32 inf = sp[0]; - ui32 U_q = sp[1]; - if (U_q > mmsbp2) - return false; - - ui32 v_n; - ui32 val = 0; - ui32 bit = 0; - if (inf & (1 << (4 + bit))) - { - //get 32 bits of magsgn data - ui32 ms_val = frwd_fetch<0xFF>(&magsgn); - ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k - frwd_advance(&magsgn, m_n); //consume m_n - - val = ms_val << 31; // get sign bit - v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits - v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB - v_n |= 1; // add center of bin - //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit - //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs - val |= (v_n + 2) << (p - 1); - } - dp[0] = val; - - v_n = 0; - val = 0; - bit = 1; - if (inf & (1 << (4 + bit))) - { - //get 32 bits of magsgn data - ui32 ms_val = frwd_fetch<0xFF>(&magsgn); - ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k - frwd_advance(&magsgn, m_n); //consume m_n - - val = ms_val << 31; // get sign bit - v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits - v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB - v_n |= 1; // add center of bin - //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit - //add 
2 to make it 2*\mu+0.5, shift it up to missing MSBs - val |= (v_n + 2) << (p - 1); - } - dp[stride] = val; - vp[0] = prev_v_n | v_n; - prev_v_n = 0; - ++dp; - if (++x >= width) - { ++vp; break; } - - val = 0; - bit = 2; - if (inf & (1 << (4 + bit))) - { - //get 32 bits of magsgn data - ui32 ms_val = frwd_fetch<0xFF>(&magsgn); - ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k - frwd_advance(&magsgn, m_n); //consume m_n - - val = ms_val << 31; // get sign bit - v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits - v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB - v_n |= 1; // add center of bin - //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit - //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs - val |= (v_n + 2) << (p - 1); - } - dp[0] = val; - - v_n = 0; - val = 0; - bit = 3; - if (inf & (1 << (4 + bit))) - { - //get 32 bits of magsgn data - ui32 ms_val = frwd_fetch<0xFF>(&magsgn); - ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k - frwd_advance(&magsgn, m_n); //consume m_n - - val = ms_val << 31; // get sign bit - v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits - v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB - v_n |= 1; // add center of bin - //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit - //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs - val |= (v_n + 2) << (p - 1); - } - dp[stride] = val; - prev_v_n = v_n; - ++dp; - ++x; - } - vp[0] = prev_v_n; - - for (ui32 y = 2; y < height; y += 2) - { - ui16 *sp = scratch + (y >> 1) * sstr; - ui32 *vp = v_n_scratch; - ui32 *dp = decoded_data + y * stride; - - prev_v_n = 0; - for (ui32 x = 0; x < width; sp += 2, ++vp) - { - ui32 inf = sp[0]; - ui32 u_q = sp[1]; - - ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1? - ui32 emax = vp[0] | vp[1]; - emax = 31 - count_leading_zeros(emax | 2); // emax - 1 - ui32 kappa = gamma ? 
emax : 1; - - ui32 U_q = u_q + kappa; - if (U_q > mmsbp2) - return false; - - ui32 v_n; - ui32 val = 0; - ui32 bit = 0; - if (inf & (1 << (4 + bit))) - { - //get 32 bits of magsgn data - ui32 ms_val = frwd_fetch<0xFF>(&magsgn); - ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k - frwd_advance(&magsgn, m_n); //consume m_n - - val = ms_val << 31; // get sign bit - v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits - v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB - v_n |= 1; // add center of bin - //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit - //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs - val |= (v_n + 2) << (p - 1); - } - dp[0] = val; - - v_n = 0; - val = 0; - bit = 1; - if (inf & (1 << (4 + bit))) - { - //get 32 bits of magsgn data - ui32 ms_val = frwd_fetch<0xFF>(&magsgn); - ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k - frwd_advance(&magsgn, m_n); //consume m_n - - val = ms_val << 31; // get sign bit - v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits - v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB - v_n |= 1; // add center of bin - //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit - //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs - val |= (v_n + 2) << (p - 1); - } - dp[stride] = val; - vp[0] = prev_v_n | v_n; - prev_v_n = 0; - ++dp; - if (++x >= width) - { ++vp; break; } - - val = 0; - bit = 2; - if (inf & (1 << (4 + bit))) - { - //get 32 bits of magsgn data - ui32 ms_val = frwd_fetch<0xFF>(&magsgn); - ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k - frwd_advance(&magsgn, m_n); //consume m_n - - val = ms_val << 31; // get sign bit - v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits - v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB - v_n |= 1; // add center of bin - //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit - //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs - val |= (v_n + 2) << (p - 1); - } - dp[0] = val; - - 
v_n = 0; - val = 0; - bit = 3; - if (inf & (1 << (4 + bit))) - { - //get 32 bits of magsgn data - ui32 ms_val = frwd_fetch<0xFF>(&magsgn); - ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k - frwd_advance(&magsgn, m_n); //consume m_n - - val = ms_val << 31; // get sign bit - v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits - v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB - v_n |= 1; // add center of bin - //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit - //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs - val |= (v_n + 2) << (p - 1); - } - dp[stride] = val; - prev_v_n = v_n; - ++dp; - ++x; - } - vp[0] = prev_v_n; - } - } - - if (num_passes > 1) - { - // We use scratch again, we can divide it into multiple regions - // sigma holds all the significant samples, and it cannot - // be modified after it is set. it will be used during the - // Magnitude Refinement Pass - ui16* const sigma = scratch; - - ui32 mstr = (width + 3u) >> 2; // divide by 4, since each - // ui16 contains 4 columns - mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8 - - // We re-arrange quad significance, where each 4 consecutive - // bits represent one quad, into column significance, where, - // each 4 consequtive bits represent one column of 4 rows - { - ui32 y; - for (y = 0; y < height; y += 4) - { - ui16* sp = scratch + (y >> 1) * sstr; - ui16* dp = sigma + (y >> 2) * mstr; - for (ui32 x = 0; x < width; x += 4, sp += 4, ++dp) { - ui32 t0 = 0, t1 = 0; - t0 = ((sp[0 ] & 0x30u) >> 4) | ((sp[0 ] & 0xC0u) >> 2); - t0 |= ((sp[2 ] & 0x30u) << 4) | ((sp[2 ] & 0xC0u) << 6); - t1 = ((sp[0+sstr] & 0x30u) >> 2) | ((sp[0+sstr] & 0xC0u) ); - t1 |= ((sp[2+sstr] & 0x30u) << 6) | ((sp[2+sstr] & 0xC0u) << 8); - dp[0] = (ui16)(t0 | t1); - } - dp[0] = 0; // set an extra entry on the right with 0 - } - { - // reset one row after the codeblock - ui16* dp = sigma + (y >> 2) * mstr; - for (ui32 x = 0; x < width; x += 4, ++dp) - dp[0] = 0; - dp[0] = 0; // set an extra 
entry on the right with 0 - } - } - - // We perform Significance Propagation Pass here - { - // This stores significance information of the previous - // 4 rows. Significance information in this array includes - // all signicant samples in bitplane p - 1; that is, - // significant samples for bitplane p (discovered during the - // cleanup pass and stored in sigma) and samples that have recently - // became significant (during the SPP) in bitplane p-1. - // We store enough for the widest row, containing 1024 columns, - // which is equivalent to 256 of ui16, since each stores 4 columns. - // We add an extra 8 entries, just in case we need more - ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes - - frwd_struct sigprop; - frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2); - - for (ui32 y = 0; y < height; y += 4) - { - ui32 pattern = 0xFFFFu; // a pattern needed samples - if (height - y < 4) { - pattern = 0x7777u; - if (height - y < 3) { - pattern = 0x3333u; - if (height - y < 2) - pattern = 0x1111u; - } - } - - // prev holds sign. info. for the previous quad, together - // with the rows on top of it and below it. - ui32 prev = 0; - ui16 *prev_sig = prev_row_sig; - ui16 *cur_sig = sigma + (y >> 2) * mstr; - ui32 *dpp = decoded_data + y * stride; - for (ui32 x = 0; x < width; x += 4, ++cur_sig, ++prev_sig) - { - // only rows and columns inside the stripe are included - si32 s = (si32)x + 4 - (si32)width; - s = ojph_max(s, 0); - pattern = pattern >> (s * 4); - - // We first find locations that need to be tested (potential - // SPP members); these location will end up in mbr - // In each iteration, we produce 16 bits because cwd can have - // up to 16 bits of significance information, followed by the - // corresponding 16 bits of sign information; therefore, it is - // sufficient to fetch 32 bit data per loop. - - // Althougth we are interested in 16 bits only, we load 32 bits. 
- // For the 16 bits we are producing, we need the next 4 bits -- - // We need data for at least 5 columns out of 8. - // Therefore loading 32 bits is easier than loading 16 bits - // twice. - ui32 ps = *(ui32*)prev_sig; - ui32 ns = *(ui32*)(cur_sig + mstr); - ui32 u = (ps & 0x88888888) >> 3; // the row on top - if (!stripe_causal) - u |= (ns & 0x11111111) << 3; // the row below - - ui32 cs = *(ui32*)cur_sig; - // vertical integration - ui32 mbr = cs; // this sig. info. - mbr |= (cs & 0x77777777) << 1; //above neighbors - mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors - mbr |= u; - // horizontal integration - ui32 t = mbr; - mbr |= t << 4; // neighbors on the left - mbr |= t >> 4; // neighbors on the right - mbr |= prev >> 12; // significance of previous group - - // remove outside samples, and already significant samples - mbr &= pattern; - mbr &= ~cs; - - // find samples that become significant during the SPP - ui32 new_sig = mbr; - if (new_sig) - { - ui32 cwd = frwd_fetch<0>(&sigprop); - - ui32 cnt = 0; - ui32 col_mask = 0xFu; - ui32 inv_sig = ~cs & pattern; - for (int i = 0; i < 16; i += 4, col_mask <<= 4) - { - if ((col_mask & new_sig) == 0) - continue; - - //scan one column - ui32 sample_mask = 0x1111u & col_mask; - if (new_sig & sample_mask) - { - new_sig &= ~sample_mask; - if (cwd & 1) - { - ui32 t = 0x33u << i; - new_sig |= t & inv_sig; - } - cwd >>= 1; ++cnt; - } - - sample_mask <<= 1; - if (new_sig & sample_mask) - { - new_sig &= ~sample_mask; - if (cwd & 1) - { - ui32 t = 0x76u << i; - new_sig |= t & inv_sig; - } - cwd >>= 1; ++cnt; - } - - sample_mask <<= 1; - if (new_sig & sample_mask) - { - new_sig &= ~sample_mask; - if (cwd & 1) - { - ui32 t = 0xECu << i; - new_sig |= t & inv_sig; - } - cwd >>= 1; ++cnt; - } - - sample_mask <<= 1; - if (new_sig & sample_mask) - { - new_sig &= ~sample_mask; - if (cwd & 1) - { - ui32 t = 0xC8u << i; - new_sig |= t & inv_sig; - } - cwd >>= 1; ++cnt; - } - } - - if (new_sig) - { - // new_sig has newly-discovered sig. 
samples during SPP - // find the signs and update decoded_data - ui32 *dp = dpp + x; - ui32 val = 3u << (p - 2); - col_mask = 0xFu; - for (int i = 0; i < 4; ++i, ++dp, col_mask <<= 4) - { - if ((col_mask & new_sig) == 0) - continue; - - //scan 4 signs - ui32 sample_mask = 0x1111u & col_mask; - if (new_sig & sample_mask) - { - assert(dp[0] == 0); - dp[0] = (cwd << 31) | val; - cwd >>= 1; ++cnt; - } - - sample_mask += sample_mask; - if (new_sig & sample_mask) - { - assert(dp[stride] == 0); - dp[stride] = (cwd << 31) | val; - cwd >>= 1; ++cnt; - } - - sample_mask += sample_mask; - if (new_sig & sample_mask) - { - assert(dp[2 * stride] == 0); - dp[2 * stride] = (cwd << 31) | val; - cwd >>= 1; ++cnt; - } - - sample_mask += sample_mask; - if (new_sig & sample_mask) - { - assert(dp[3 * stride] == 0); - dp[3 * stride] = (cwd << 31) | val; - cwd >>= 1; ++cnt; - } - } - } - frwd_advance(&sigprop, cnt); - } - - new_sig |= cs; - *prev_sig = (ui16)(new_sig); - - // vertical integration for the new sig. info. - t = new_sig; - new_sig |= (t & 0x7777) << 1; //above neighbors - new_sig |= (t & 0xEEEE) >> 1; //below neighbors - // add sig. info. 
from the row on top and below - prev = new_sig | u; - // we need only the bits in 0xF000 - prev &= 0xF000; - } - } - } - - // We perform Magnitude Refinement Pass here - if (num_passes > 2) - { - rev_struct magref; - rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2); - - for (ui32 y = 0; y < height; y += 4) - { - ui32 *cur_sig = (ui32*)(sigma + (y >> 2) * mstr); - ui32 *dpp = decoded_data + y * stride; - ui32 half = 1 << (p - 2); - for (ui32 i = 0; i < width; i += 8) - { - //Process one entry from sigma array at a time - // Each nibble (4 bits) in the sigma array represents 4 rows, - // and the 32 bits contain 8 columns - ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data - ui32 sig = *cur_sig++; // 32 bit that will be processed now - ui32 col_mask = 0xFu; // a mask for a column in sig - if (sig) // if any of the 32 bits are set - { - for (int j = 0; j < 8; ++j) //one column at a time - { - if (sig & col_mask) // lowest nibble - { - ui32 *dp = dpp + i + j; // next column in decoded samples - ui32 sample_mask = 0x11111111u & col_mask; //LSB - - for (int k = 0; k < 4; ++k) { - if (sig & sample_mask) //if LSB is set - { - assert(dp[0] != 0); // decoded value cannot be zero - assert((dp[0] & half) == 0); // no half - ui32 sym = cwd & 1; // get it value - sym = (1 - sym) << (p - 1); // previous center of bin - sym |= half; // put half the center of bin - dp[0] ^= sym; // remove old bin center and put new - cwd >>= 1; // consume word - } - sample_mask += sample_mask; //next row - dp += stride; // next samples row - } - } - col_mask <<= 4; //next column - } - } - // consume data according to the number of bits set - rev_advance_mrp(&magref, population_count(sig)); - } - } - } - } - return true; - } - //************************************************************************/ /** @brief Decodes one codeblock, processing the cleanup, siginificance * propagation, and magnitude refinement pass From e80391414e86eb9009ec782055d7ded2af39e2ec Mon Sep 17 00:00:00 
2001 From: Aous Naman Date: Thu, 7 Nov 2024 12:51:59 +1100 Subject: [PATCH 64/78] Automated SPqcd to use the LSBs as well. --- src/core/codestream/ojph_params.cpp | 12 ++++++------ src/core/codestream/ojph_params_local.h | 15 +++++++++++++-- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index 6895d1be..04e52a63 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -952,16 +952,16 @@ namespace ojph { int s = 0; double bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); ui32 X = (ui32) ceil(log(bibo_l * bibo_l) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << reversible_SPqcd_shift); + u8_SPqcd[s++] = encode_SPqcd((ui8)(B + X)); for (ui32 d = num_decomps; d > 0; --d) { double bibo_l = bibo_gains::get_bibo_gain_l(d, true); double bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); X = (ui32) ceil(log(bibo_h * bibo_l) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << reversible_SPqcd_shift); - u8_SPqcd[s++] = (ui8)((B + X) << reversible_SPqcd_shift); + u8_SPqcd[s++] = encode_SPqcd((ui8)(B + X)); + u8_SPqcd[s++] = encode_SPqcd((ui8)(B + X)); X = (ui32) ceil(log(bibo_h * bibo_h) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << reversible_SPqcd_shift); + u8_SPqcd[s++] = encode_SPqcd((ui8)(B + X)); } } @@ -1018,7 +1018,7 @@ namespace ojph { int irrev = Sqcd & 0x1F; if (irrev == 0) //reversible for (ui32 i = 0; i < num_subbands; ++i) { - ui32 t = (u8_SPqcd[i] >> reversible_SPqcd_shift); + ui32 t = decode_SPqcd(u8_SPqcd[i]); t += get_num_guard_bits() - 1u; B = ojph_max(B, t); } @@ -1093,7 +1093,7 @@ namespace ojph { int irrev = Sqcd & 0x1F; if (irrev == 0) // reversible; this is (10.22) from the J2K book { - num_bits += u8_SPqcd[idx] >> reversible_SPqcd_shift; + num_bits += decode_SPqcd(u8_SPqcd[idx]); num_bits = num_bits == 0 ? 
0 : num_bits - 1; } else if (irrev == 1) diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 0ebdd537..04e6534f 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -590,7 +590,7 @@ namespace ojph { { friend ::ojph::param_qcd; public: - param_qcd() : reversible_SPqcd_shift(3) + param_qcd() : reversible_SPqcd_shift(3), old_SPqcd(false) { Lqcd = 0; Sqcd = 0; @@ -643,10 +643,21 @@ namespace ojph { bool is_employing_color_transform); void set_irrev_quant(ui32 num_decomps); - protected: + ui8 decode_SPqcd(ui8 v) const + { + if (old_SPqcd) return (ui8)(v >> reversible_SPqcd_shift); // old + else return (ui8)((v << 6) | (v >> 3)); // new + } + ui8 encode_SPqcd(ui8 v) const + { + if (old_SPqcd) return (ui8)(v << reversible_SPqcd_shift); // old + else return (ui8)((v >> 6) | (v << 3)); // new + } + protected: ui16 Lqcd; ui8 Sqcd; const ui8 reversible_SPqcd_shift; + const bool old_SPqcd; union { ui8 u8_SPqcd[97]; From b33083aab7aa8f1d1a9136cd2827285d4440c53c Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Thu, 7 Nov 2024 17:44:37 +1100 Subject: [PATCH 65/78] AVX2 is done. Bug fix for SPqcd. 
--- src/core/codestream/ojph_params_local.h | 7 +- src/core/transform/ojph_colour.cpp | 6 +- src/core/transform/ojph_colour_avx2.cpp | 334 ++++++++++++++++++------ 3 files changed, 269 insertions(+), 78 deletions(-) diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 04e6534f..905e0306 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -646,12 +646,15 @@ namespace ojph { ui8 decode_SPqcd(ui8 v) const { if (old_SPqcd) return (ui8)(v >> reversible_SPqcd_shift); // old - else return (ui8)((v << 6) | (v >> 3)); // new + else { + v = v & 0b11111011; + return (ui8)((v << 5) | (v >> 3)); // new + } } ui8 encode_SPqcd(ui8 v) const { if (old_SPqcd) return (ui8)(v << reversible_SPqcd_shift); // old - else return (ui8)((v >> 6) | (v << 3)); // new + else return (ui8)((v >> 5) | (v << 3)); // new } protected: ui16 Lqcd; diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index a4effbf7..54077f50 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -163,9 +163,9 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - //rev_convert = avx2_rev_convert; - //rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3; - // rct_forward = avx2_rct_forward; + rev_convert = avx2_rev_convert; + rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3; + rct_forward = avx2_rct_forward; rct_backward = avx2_rct_backward; } #endif // !OJPH_DISABLE_AVX2 diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 9e550d3f..324a5a9c 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -61,66 +61,272 @@ namespace ojph { return result; } - ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void 
avx2_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) { - __m256i sh = _mm256_set1_epi32(shift); - for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi32((si32)shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + s = _mm256_add_epi32(s, sh); + _mm256_storeu_si256((__m256i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m256i zero = _mm256_setzero_si256(); + __m256i sh = _mm256_set1_epi64x(shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s, t; + s = _mm256_loadu_si256((__m256i*)sp); + + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0)); + t = _mm256_add_epi64(t, sh); + _mm256_storeu_si256((__m256i*)dp, t); + + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1)); + t = _mm256_add_epi64(t, sh); + _mm256_storeu_si256((__m256i*)dp + 1, t); + } + } + } + else { - __m256i s = _mm256_loadu_si256((__m256i*)sp); - s = _mm256_add_epi32(s, sh); - _mm256_storeu_si256((__m256i*)dp, s); + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + __m256i sh = _mm256_set1_epi64x(shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s, t; + s = _mm256_loadu_si256((__m256i*)sp); + s = _mm256_add_epi64(s, sh); + + t = _mm256_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm256_and_si256(low_bits, t); + + s = 
_mm256_loadu_si256((__m256i*)sp + 1); + s = _mm256_add_epi64(s, sh); + + s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); + s = _mm256_andnot_si256(low_bits, s); + + t = _mm256_or_si256(s, t); + t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i*)dp, t); + } } } ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, - int shift, ui32 width) + void avx2_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) { - __m256i sh = _mm256_set1_epi32(-shift); - __m256i zero = _mm256_setzero_si256(); - for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi32((si32)(-shift)); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + __m256i c = _mm256_cmpgt_epi32(zero, s); // 0xFFFFFFFF for -ve val + __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only -shift-val + s = _mm256_andnot_si256(c, s); // keep only +ve or 0 + s = _mm256_or_si256(s, v_m_sh); // combine + _mm256_storeu_si256((__m256i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(-shift); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s, t, u0, u1, c, v_m_sh; + s = _mm256_loadu_si256((__m256i*)sp); + + t = _mm256_cmpgt_epi32(zero, s); // find -ve 32bit -1 + u0 = _mm256_unpacklo_epi32(s, t); // correct 64bit data + c = 
_mm256_unpacklo_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm256_sub_epi64(sh, u0); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value + u0 = _mm256_andnot_si256(c, u0); // keep only +ve or 0 + u0 = _mm256_or_si256(u0, v_m_sh); // combine + + u1 = _mm256_unpackhi_epi32(s, t); // correct 64bit data + c = _mm256_unpackhi_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm256_sub_epi64(sh, u1); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value + u1 = _mm256_andnot_si256(c, u1); // keep only +ve or 0 + u1 = _mm256_or_si256(u1, v_m_sh); // combine + + t = _mm256_permute2x128_si256(u0, u1, (2 << 4) | 0); + _mm256_storeu_si256((__m256i*)dp, t); + + t = _mm256_permute2x128_si256(u0, u1, (3 << 4) | 1); + _mm256_storeu_si256((__m256i*)dp + 1, t); + } + } + } + else { - __m256i s = _mm256_loadu_si256((__m256i*)sp); - __m256i c = _mm256_cmpgt_epi32(s, zero); // 0xFFFFFFFF for +ve value - __m256i z = _mm256_cmpeq_epi32(s, zero); // 0xFFFFFFFF for 0 - c = _mm256_or_si256(c, z); // 0xFFFFFFFF for +ve and 0 - - __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value - v_m_sh = _mm256_andnot_si256(c, v_m_sh); // keep only - shift - value - s = _mm256_and_si256(c, s); // keep only +ve or 0 - s = _mm256_or_si256(s, v_m_sh); // combine - _mm256_storeu_si256((__m256i*)dp, s); + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(-shift); + __m256i zero = _mm256_setzero_si256(); + __m256i half_mask = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + __m256i s, t, p, n, m, tm; + s = _mm256_loadu_si256((__m256i*)sp); + + m = 
_mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value + tm = _mm256_sub_epi64(sh, s); // - shift - value + n = _mm256_and_si256(m, tm); // -ve + p = _mm256_andnot_si256(m, s); // +ve + tm = _mm256_or_si256(n, p); + tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm256_and_si256(half_mask, tm); + + s = _mm256_loadu_si256((__m256i*)sp + 1); + m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value + tm = _mm256_sub_epi64(sh, s); // - shift - value + n = _mm256_and_si256(m, tm); // -ve + p = _mm256_andnot_si256(m, s); // +ve + tm = _mm256_or_si256(n, p); + tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0)); + tm = _mm256_andnot_si256(half_mask, tm); + + t = _mm256_or_si256(t, tm); + t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i*)dp, t); + } } } - // - // _mm256_cvtepi32_epi64 - // - - // ////////////////////////////////////////////////////////////////////////// - // void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - // si32 *y, si32 *cb, si32 *cr, ui32 repeat) - // { - // for (int i = (repeat + 7) >> 3; i > 0; --i) - // { - // __m256i mr = _mm256_load_si256((__m256i*)r); - // __m256i mg = _mm256_load_si256((__m256i*)g); - // __m256i mb = _mm256_load_si256((__m256i*)b); - // __m256i t = _mm256_add_epi32(mr, mb); - // t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); - // _mm256_store_si256((__m256i*)y, _mm256_srai_epi32(t, 2)); - // t = _mm256_sub_epi32(mb, mg); - // _mm256_store_si256((__m256i*)cb, t); - // t = _mm256_sub_epi32(mr, mg); - // _mm256_store_si256((__m256i*)cr, t); - - // r += 8; g += 8; b += 8; - // y += 8; cb += 8; cr += 8; - // } - // } + ////////////////////////////////////////////////////////////////////////// + void avx2_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) + { + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & 
line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) + { + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i mr = _mm256_load_si256((__m256i*)rp); + __m256i mg = _mm256_load_si256((__m256i*)gp); + __m256i mb = _mm256_load_si256((__m256i*)bp); + __m256i t = _mm256_add_epi32(mr, mb); + t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); + _mm256_store_si256((__m256i*)yp, _mm256_srai_epi32(t, 2)); + t = _mm256_sub_epi32(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi32(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + rp += 8; gp += 8; bp += 8; + yp += 8; cbp += 8; crp += 8; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m256i zero = _mm256_setzero_si256(); + __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i mr32 = _mm256_load_si256((__m256i*)rp); + __m256i mg32 = _mm256_load_si256((__m256i*)gp); + __m256i mb32 = _mm256_load_si256((__m256i*)bp); + __m256i mr, mg, mb, t; + mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0)); + mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0)); + mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0)); + + t = 
_mm256_add_epi64(mr, mb); + t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); + _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); + t = _mm256_sub_epi64(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi64(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + yp += 4; cbp += 4; crp += 4; + + mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1)); + mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1)); + mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1)); + + t = _mm256_add_epi64(mr, mb); + t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); + _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); + t = _mm256_sub_epi64(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi64(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + rp += 8; gp += 8; bp += 8; + yp += 4; cbp += 4; crp += 4; + } + } + } ////////////////////////////////////////////////////////////////////////// void avx2_rct_backward(const line_buf *y, @@ -173,7 +379,8 @@ namespace ojph { (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); - __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, 0, (si64)ULLONG_MAX); + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; for (int i = (repeat + 7) >> 3; i > 0; --i) @@ -210,12 +417,17 @@ namespace ojph { tr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); tr = _mm256_andnot_si256(low_bits, tr); mr = _mm256_or_si256(mr, tr); + mr = _mm256_permute4x64_epi64(mr, _MM_SHUFFLE(3, 1, 2, 0)); + tg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); tg = _mm256_andnot_si256(low_bits, tg); mg = _mm256_or_si256(mg, tg); + mg = _mm256_permute4x64_epi64(mg, _MM_SHUFFLE(3, 1, 2, 0)); + tb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); tb = _mm256_andnot_si256(low_bits, 
tb); mb = _mm256_or_si256(mb, tb); + mb = _mm256_permute4x64_epi64(mb, _MM_SHUFFLE(3, 1, 2, 0)); _mm256_store_si256((__m256i*)rp, mr); _mm256_store_si256((__m256i*)gp, mg); @@ -227,29 +439,5 @@ namespace ojph { } } - - // ////////////////////////////////////////////////////////////////////////// - // void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - // si32 *r, si32 *g, si32 *b, ui32 repeat) - // { - // for (int i = (repeat + 7) >> 3; i > 0; --i) - // { - // __m256i my = _mm256_load_si256((__m256i*)y); - // __m256i mcb = _mm256_load_si256((__m256i*)cb); - // __m256i mcr = _mm256_load_si256((__m256i*)cr); - - // __m256i t = _mm256_add_epi32(mcb, mcr); - // t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); - // _mm256_store_si256((__m256i*)g, t); - // __m256i u = _mm256_add_epi32(mcb, t); - // _mm256_store_si256((__m256i*)b, u); - // u = _mm256_add_epi32(mcr, t); - // _mm256_store_si256((__m256i*)r, u); - - // y += 8; cb += 8; cr += 8; - // r += 8; g += 8; b += 8; - // } - // } - } } From 34e3fc3755cc285c27a5d4252f640a233d723ba2 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 8 Nov 2024 07:43:28 +1100 Subject: [PATCH 66/78] Removing two unnecessary lines. 
--- src/core/transform/ojph_colour_avx2.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 324a5a9c..05bff311 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -86,7 +86,6 @@ namespace ojph { { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; - __m256i zero = _mm256_setzero_si256(); __m256i sh = _mm256_set1_epi64x(shift); for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) { @@ -286,7 +285,6 @@ namespace ojph { (r->flags & line_buf::LFT_32BIT) && (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); - __m256i zero = _mm256_setzero_si256(); __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; From f37ae83be988d52fd25eae43e54aefcc28f4b1a5 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 8 Nov 2024 15:42:10 +1100 Subject: [PATCH 67/78] All wasm code has been done -- needs extensive tests. 
--- src/core/codestream/ojph_codeblock_fun.cpp | 31 +- src/core/codestream/ojph_codestream_wasm.cpp | 106 ++- src/core/codestream/ojph_params_local.h | 28 +- src/core/transform/ojph_colour.cpp | 6 +- src/core/transform/ojph_colour_local.h | 12 +- src/core/transform/ojph_colour_wasm.cpp | 370 ++++++++-- src/core/transform/ojph_transform_avx.cpp | 88 +-- src/core/transform/ojph_transform_local.h | 54 -- src/core/transform/ojph_transform_sse.cpp | 42 +- src/core/transform/ojph_transform_sse2.cpp | 120 ++-- src/core/transform/ojph_transform_wasm.cpp | 695 +++++++++++++++++-- 11 files changed, 1204 insertions(+), 348 deletions(-) diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index c0b70dc9..45504983 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -97,6 +97,8 @@ namespace ojph { float delta_inv, ui32 count, ui64* max_val); void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64* max_val); + void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); ////////////////////////////////////////////////////////////////////////// void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, @@ -122,6 +124,8 @@ namespace ojph { float delta, ui32 count); void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count); + void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); void codeblock_fun::init(bool reversible) { @@ -240,18 +244,31 @@ namespace ojph { #else // OJPH_ENABLE_WASM_SIMD // Accelerated functions for WASM SIMD. 
- decode_cb = ojph_decode_codeblock_wasm; - find_max_val = wasm_find_max_val; + decode_cb32 = ojph_decode_codeblock_wasm; + find_max_val32 = wasm_find_max_val32; mem_clear = wasm_mem_clear; if (reversible) { - tx_to_cb = wasm_rev_tx_to_cb; - tx_from_cb = wasm_rev_tx_from_cb; + tx_to_cb32 = wasm_rev_tx_to_cb32; + tx_from_cb32 = wasm_rev_tx_from_cb32; } else { - tx_to_cb = wasm_irv_tx_to_cb; - tx_from_cb = wasm_irv_tx_from_cb; + tx_to_cb32 = wasm_irv_tx_to_cb32; + tx_from_cb32 = wasm_irv_tx_from_cb32; + } + encode_cb32 = ojph_encode_codeblock32; + + decode_cb64 = ojph_decode_codeblock64; + find_max_val64 = wasm_find_max_val64; + if (reversible) { + tx_to_cb64 = wasm_rev_tx_to_cb64; + tx_from_cb64 = wasm_rev_tx_from_cb64; } - encode_cb = ojph_encode_codeblock; + else + { + tx_to_cb64 = NULL; + tx_from_cb64 = NULL; + } + encode_cb64 = ojph_encode_codeblock64; #endif // !OJPH_ENABLE_WASM_SIMD diff --git a/src/core/codestream/ojph_codestream_wasm.cpp b/src/core/codestream/ojph_codestream_wasm.cpp index 19e47aa3..8dd76491 100644 --- a/src/core/codestream/ojph_codestream_wasm.cpp +++ b/src/core/codestream/ojph_codestream_wasm.cpp @@ -35,6 +35,7 @@ // Date: 15 May 2022 //***************************************************************************/ +#include #include #include @@ -43,20 +44,17 @@ namespace ojph { namespace local { - ////////////////////////////////////////////////////////////////////////// - #define REPEAT(a) a,a,a,a - ////////////////////////////////////////////////////////////////////////// void wasm_mem_clear(void* addr, size_t count) { float* p = (float*)addr; - v128_t zero = wasm_i32x4_const(REPEAT(0)); + v128_t zero = wasm_i32x4_splat(0); for (size_t i = 0; i < count; i += 16, p += 4) wasm_v128_store(p, zero); } ////////////////////////////////////////////////////////////////////////// - ui32 wasm_find_max_val(ui32* address) + ui32 wasm_find_max_val32(ui32* address) { v128_t x1, x0 = wasm_v128_load(address); x1 = wasm_i32x4_shuffle(x0, x0, 2, 3, 2, 3); 
// x1 = x0[2,3,2,3] @@ -68,16 +66,26 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + ui64 wasm_find_max_val64(ui64* address) + { + v128_t x1, x0 = wasm_v128_load(address); + x1 = wasm_i64x2_shuffle(x0, x0, 1, 1); // x1 = x0[2,3,2,3] + x0 = wasm_v128_or(x0, x1); + ui64 t = (ui64)wasm_i64x2_extract_lane(x0, 0); + return t; + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(delta_inv); // convert to sign and magnitude and keep max_val ui32 shift = 31 - K_max; - v128_t m0 = wasm_i32x4_const(REPEAT((int)0x80000000)); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t m0 = wasm_i32x4_splat(INT_MIN); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); v128_t tmax = wasm_v128_load(max_val); v128_t *p = (v128_t*)sp; for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4) @@ -97,16 +105,16 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(K_max); //quantize and convert to sign and magnitude and keep max_val v128_t d = wasm_f32x4_splat(delta_inv); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); v128_t tmax = wasm_v128_load(max_val); float *p = (float*)sp; for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4) @@ -127,14 +135,14 @@ namespace ojph { } 
////////////////////////////////////////////////////////////////////////// - void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; - v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF)); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t m1 = wasm_i32x4_splat(INT_MAX); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); si32 *p = (si32*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) { @@ -150,11 +158,11 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); - v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF)); + v128_t m1 = wasm_i32x4_splat(INT_MAX); v128_t d = wasm_f32x4_splat(delta); float *p = (float*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) @@ -167,6 +175,58 @@ namespace ojph { valf = wasm_v128_or(valf, sign); wasm_v128_store(p, valf); } - } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 63 - K_max; + v128_t m0 = wasm_i64x2_splat(LLONG_MIN); + v128_t zero = wasm_i64x2_splat(0); + v128_t one = wasm_i64x2_splat(1); + v128_t tmax = wasm_v128_load(max_val); + si64 *p = (si64*)sp; + for (ui32 i = 0; i < count; i += 2, p += 2, dp += 2) + { + v128_t v = wasm_v128_load((v128_t*)sp); + v128_t sign = wasm_i64x2_lt(v, zero); + v128_t val = wasm_v128_xor(v, sign); // negate 1's complement + v128_t ones = wasm_v128_and(sign, 
one); + val = wasm_i64x2_add(val, ones); // 2's complement + sign = wasm_v128_and(sign, m0); + val = wasm_i64x2_shl(val, shift); + tmax = wasm_v128_or(tmax, val); + val = wasm_v128_or(val, sign); + wasm_v128_store(dp, val); + } + wasm_v128_store(max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 63 - K_max; + v128_t m1 = wasm_i64x2_splat(LLONG_MAX); + v128_t zero = wasm_i64x2_splat(0); + v128_t one = wasm_i64x2_splat(1); + si64 *p = (si64*)dp; + for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2) + { + v128_t v = wasm_v128_load((v128_t*)sp); + v128_t val = wasm_v128_and(v, m1); + val = wasm_i64x2_shr(val, shift); + v128_t sign = wasm_i64x2_lt(v, zero); + val = wasm_v128_xor(val, sign); // negate 1's complement + v128_t ones = wasm_v128_and(sign, one); + val = wasm_i64x2_add(val, ones); // 2's complement + wasm_v128_store(p, val); + } + } } } \ No newline at end of file diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 905e0306..55dbbde0 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -176,10 +176,16 @@ namespace ojph { public: param_siz() { - memset(this, 0, sizeof(param_siz)); + Lsiz = Csiz = 0; + Xsiz = Ysiz = XOsiz = YOsiz = XTsiz = YTsiz = XTOsiz = YTOsiz = 0; + skipped_resolutions = 0; + memset(store, 0, sizeof(store)); + ws_kern_support_needed = dfs_support_needed = false; + cod = NULL; + dfs = NULL; + Rsiz = RSIZ_HT_FLAG; cptr = store; old_Csiz = 4; - Rsiz = RSIZ_HT_FLAG; } ~param_siz() @@ -882,9 +888,10 @@ namespace ojph { }; public: // member functions - param_dfs() { memset(this, 0, sizeof(param_dfs)); } + param_dfs() { init(); } ~param_dfs() { if (next) delete next; } - void init() { memset(this, 0, sizeof(param_dfs)); } + void init() + { Ldfs = Sdfs = Ids = 0; 
memset(Ddfs, 0, sizeof(Ddfs)); next = NULL; } bool read(infile_base *file); bool exists() const { return Ldfs != 0; } @@ -959,8 +966,17 @@ namespace ojph { bool read_coefficient(infile_base *file, float &K); bool read_coefficient(infile_base *file, si16 &K); void init(bool clear_all = true) { - if (clear_all) - memset(this, 0, sizeof(param_atk)); + if (clear_all) + { + Latk = Satk = 0; + Katk = 0.0f; + Natk = 0; + d = NULL; + max_steps = 0; + memset(d_store, 0, sizeof(d_store)); + next = NULL; + alloced_next = false; + } d = d_store; max_steps = sizeof(d_store) / sizeof(lifting_step); } void init_irv97(); diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 54077f50..a98b477b 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -177,8 +177,9 @@ namespace ojph { #endif // !OJPH_DISABLE_SIMD #else // OJPH_ENABLE_WASM_SIMD - cnvrt_si32_to_si32_shftd = wasm_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = wasm_cnvrt_si32_to_si32_nlt_type3; + + rev_convert = wasm_rev_convert; + rev_convert_nlt_type3 = wasm_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = wasm_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = wasm_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd; @@ -187,6 +188,7 @@ namespace ojph { rct_backward = wasm_rct_backward; ict_forward = wasm_ict_forward; ict_backward = wasm_ict_backward; + #endif // !OJPH_ENABLE_WASM_SIMD colour_transform_functions_initialized = true; diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index 71cf4541..5eb8b746 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -275,12 +275,16 @@ namespace ojph { ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void wasm_rev_convert( + const line_buf *src_line, 
const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); + void wasm_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void wasm_rct_forward( diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 57b84c7e..9628d556 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -39,12 +39,164 @@ #include #include "ojph_defs.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include "ojph_colour_local.h" namespace ojph { namespace local { + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i32x4_splat((si32)shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s = wasm_v128_load(sp); + s = wasm_i32x4_add(s, sh); + wasm_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s, t; + s = wasm_v128_load(sp); + + t = wasm_i64x2_extend_low_i32x4(s); + t = wasm_i64x2_add(t, sh); + wasm_v128_store(dp, t); + + t = wasm_i64x2_extend_high_i32x4(s); + t = wasm_i64x2_add(t, sh); + 
wasm_v128_store(dp + 1, t); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s0, s1; + s0 = wasm_v128_load(sp); + s0 = wasm_i64x2_add(s0, sh); + s1 = wasm_v128_load(sp + 1); + s1 = wasm_i64x2_add(s1, sh); + s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2); + wasm_v128_store(dp, s0); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i32x4_splat((si32)(-shift)); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s = wasm_v128_load(sp); + v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value + v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + s = wasm_v128_andnot(c, s); // keep only +ve or 0 + s = wasm_v128_or(s, v_m_sh); // combine + wasm_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s, u, c, v_m_sh; + s = wasm_v128_load(sp); + + u = wasm_i64x2_extend_low_i32x4(s); + c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = 
wasm_v128_and(c, v_m_sh); // keep only - shift - value + u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(u, v_m_sh); // combine + + wasm_v128_store(dp, u); + + u = wasm_i64x2_extend_high_i32x4(s); + c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(u, v_m_sh); // combine + + wasm_v128_store(dp + 1, u); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + v128_t s, t0, t1, p, n, m, tm; + s = wasm_v128_load(sp); + m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = wasm_i64x2_sub(sh, s); // - shift - value + n = wasm_v128_and(m, tm); // -ve + p = wasm_v128_andnot(m, s); // +ve + t0 = wasm_v128_or(n, p); + + s = wasm_v128_load(sp + 1); + m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = wasm_i64x2_sub(sh, s); // - shift - value + n = wasm_v128_and(m, tm); // -ve + p = wasm_v128_andnot(m, s); // +ve + t1 = wasm_v128_or(n, p); + + t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2); + wasm_v128_store(dp, t0); + } + } + } + ////////////////////////////////////////////////////////////////////////// void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width) @@ -108,80 +260,182 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void wasm_rct_forward(const line_buf *r, + const 
line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) { - v128_t sh = wasm_i32x4_splat(shift); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - v128_t s = wasm_v128_load(sp); - s = wasm_i32x4_add(s, sh); - wasm_v128_store(dp, s); - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; - ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, - int shift, ui32 width) - { - v128_t sh = wasm_i32x4_splat(-shift); - v128_t zero = wasm_i32x4_splat(0); - for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr = wasm_v128_load(rp); + v128_t mg = wasm_v128_load(gp); + v128_t mb = wasm_v128_load(bp); + v128_t t = wasm_i32x4_add(mr, mb); + t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1)); + wasm_v128_store(yp, wasm_i32x4_shr(t, 2)); + t = wasm_i32x4_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i32x4_sub(mr, mg); + wasm_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else { - v128_t s = wasm_v128_load(sp); - v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value - v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value - v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value - s = wasm_v128_andnot(c, s); // keep only 
+ve or 0 - s = wasm_v128_or(s, v_m_sh); // combine - wasm_v128_store(dp, s); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr32 = wasm_v128_load(rp); + v128_t mg32 = wasm_v128_load(gp); + v128_t mb32 = wasm_v128_load(bp); + v128_t mr, mg, mb, t; + mr = wasm_i64x2_extend_low_i32x4(mr32); + mg = wasm_i64x2_extend_low_i32x4(mg32); + mb = wasm_i64x2_extend_low_i32x4(mb32); + + t = wasm_i64x2_add(mr, mb); + t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); + wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); + t = wasm_i64x2_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i64x2_sub(mr, mg); + wasm_v128_store(crp, t); + + yp += 2; cbp += 2; crp += 2; + + mr = wasm_i64x2_extend_high_i32x4(mr32); + mg = wasm_i64x2_extend_high_i32x4(mg32); + mb = wasm_i64x2_extend_high_i32x4(mb32); + + t = wasm_i64x2_add(mr, mb); + t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); + wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); + t = wasm_i64x2_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i64x2_sub(mr, mg); + wasm_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } } } ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void wasm_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 3) >> 2; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) 
&& + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - v128_t mr = wasm_v128_load(r); - v128_t mg = wasm_v128_load(g); - v128_t mb = wasm_v128_load(b); - v128_t t = wasm_i32x4_add(mr, mb); - t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1)); - wasm_v128_store(y, wasm_i32x4_shr(t, 2)); - t = wasm_i32x4_sub(mb, mg); - wasm_v128_store(cb, t); - t = wasm_i32x4_sub(mr, mg); - wasm_v128_store(cr, t); + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my = wasm_v128_load(yp); + v128_t mcb = wasm_v128_load(cbp); + v128_t mcr = wasm_v128_load(crp); - r += 4; g += 4; b += 4; - y += 4; cb += 4; cr += 4; - } - } + v128_t t = wasm_i32x4_add(mcb, mcr); + t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2)); + wasm_v128_store(gp, t); + v128_t u = wasm_i32x4_add(mcb, t); + wasm_v128_store(bp, u); + u = wasm_i32x4_add(mcr, t); + wasm_v128_store(rp, u); - ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) - { - for (int i = (repeat + 3) >> 2; i > 0; --i) + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } + } + else { - v128_t my = wasm_v128_load(y); - v128_t mcb = wasm_v128_load(cb); - v128_t mcr = wasm_v128_load(cr); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = 
r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1; + my = wasm_v128_load(yp); + mcb = wasm_v128_load(cbp); + mcr = wasm_v128_load(crp); - v128_t t = wasm_i32x4_add(mcb, mcr); - t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2)); - wasm_v128_store(g, t); - v128_t u = wasm_i32x4_add(mcb, t); - wasm_v128_store(b, u); - u = wasm_i32x4_add(mcr, t); - wasm_v128_store(r, u); + tg0 = wasm_i64x2_add(mcb, mcr); + tg0 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg0, 2)); + tb0 = wasm_i64x2_add(mcb, tg0); + tr0 = wasm_i64x2_add(mcr, tg0); - y += 4; cb += 4; cr += 4; - r += 4; g += 4; b += 4; + yp += 2; cbp += 2; crp += 2; + + my = wasm_v128_load(yp); + mcb = wasm_v128_load(cbp); + mcr = wasm_v128_load(crp); + + tg1 = wasm_i64x2_add(mcb, mcr); + tg1 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg1, 2)); + tb1 = wasm_i64x2_add(mcb, tg1); + tr1 = wasm_i64x2_add(mcr, tg1); + + tr0 = wasm_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2); + tg0 = wasm_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2); + tb0 = wasm_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2); + + wasm_v128_store(rp, tr0); + wasm_v128_store(gp, tg0); + wasm_v128_store(bp, tb0); + + yp += 2; cbp += 2; crp += 2; + rp += 4; gp += 4; bp += 4; + } } } diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp index 4e5b82e7..8838d18c 100644 --- a/src/core/transform/ojph_transform_avx.cpp +++ b/src/core/transform/ojph_transform_avx.cpp @@ -63,71 +63,35 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// static inline - void avx_deinterleave32(float* dpl, float* dph, float* sp, - int width, bool even) + void avx_deinterleave32(float* dpl, float* dph, float* sp, int width) { - if (even) + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) { - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - 
__m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, e); - _mm256_store_ps(dph, f); - } - } - else - { - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, f); - _mm256_store_ps(dph, e); - } + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); } } ////////////////////////////////////////////////////////////////////////// static inline - void avx_interleave32(float* dp, float* spl, float* sph, - int width, bool even) + void avx_interleave32(float* dp, float* spl, float* sph, int width) { - if (even) + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) { - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(a, b); - __m256 d = _mm256_unpackhi_ps(a, b); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } - } - else - { - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c 
= _mm256_unpacklo_ps(b, a); - __m256 d = _mm256_unpackhi_ps(b, a); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m256 a = _mm256_load_ps(spl); + __m256 b = _mm256_load_ps(sph); + __m256 c = _mm256_unpacklo_ps(a, b); + __m256 d = _mm256_unpackhi_ps(a, b); + __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); + __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); + _mm256_store_ps(dp, e); + _mm256_store_ps(dp + 8, f); } } @@ -170,11 +134,11 @@ namespace ojph { { // split src into ldst and hdst { - float* dpl = ldst->f32; - float* dph = hdst->f32; + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; float* sp = src->f32; int w = (int)width; - avx_deinterleave32(dpl, dph, sp, w, even); + avx_deinterleave32(dpl, dph, sp, w); } // the actual horizontal transform @@ -305,10 +269,10 @@ namespace ojph { // combine both lsrc and hsrc into dst { float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? 
hsrc->f32 : lsrc->f32; int w = (int)width; - avx_interleave32(dp, spl, sph, w, even); + avx_interleave32(dp, spl, sph, w); } } else { diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h index 5406124c..acf9ee6d 100644 --- a/src/core/transform/ojph_transform_local.h +++ b/src/core/transform/ojph_transform_local.h @@ -107,60 +107,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - // Supporting macros - ////////////////////////////////////////////////////////////////////////// - - ////////////////////////////////////////////////////////////////////////// - #define SSE_DEINTERLEAVE32(dpl, dph, sp, width, even) \ - { \ - if (even) \ - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ - { \ - __m128 a = _mm_load_ps(sp); \ - __m128 b = _mm_load_ps(sp + 4); \ - __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm_store_ps(dpl, c); \ - _mm_store_ps(dph, d); \ - } \ - else \ - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ - { \ - __m128 a = _mm_load_ps(sp); \ - __m128 b = _mm_load_ps(sp + 4); \ - __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm_store_ps(dpl, d); \ - _mm_store_ps(dph, c); \ - } \ - } - - ////////////////////////////////////////////////////////////////////////// - #define SSE_INTERLEAVE32(dp, spl, sph, width, even) \ - { \ - if (even) \ - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ - { \ - __m128 a = _mm_load_ps(spl); \ - __m128 b = _mm_load_ps(sph); \ - __m128 c = _mm_unpacklo_ps(a, b); \ - __m128 d = _mm_unpackhi_ps(a, b); \ - _mm_store_ps(dp, c); \ - _mm_store_ps(dp + 4, d); \ - } \ - else \ - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ - { \ - __m128 a = _mm_load_ps(spl); \ 
- __m128 b = _mm_load_ps(sph); \ - __m128 c = _mm_unpacklo_ps(b, a); \ - __m128 d = _mm_unpackhi_ps(b, a); \ - _mm_store_ps(dp, c); \ - _mm_store_ps(dp + 4, d); \ - } \ - } - ////////////////////////////////////////////////////////////////////////// // Irreversible functions ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp index e878746d..dcb5e53e 100644 --- a/src/core/transform/ojph_transform_sse.cpp +++ b/src/core/transform/ojph_transform_sse.cpp @@ -50,6 +50,36 @@ namespace ojph { namespace local { + ////////////////////////////////////////////////////////////////////////// + static inline + void sse_deinterleave32(float* dpl, float* dph, float* sp, int width) + { + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + __m128 a = _mm_load_ps(sp); + __m128 b = _mm_load_ps(sp + 4); + __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); + __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); + _mm_store_ps(dpl, c); + _mm_store_ps(dph, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void sse_interleave32(float* dp, float* spl, float* sph, int width) \ + { + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) + { + __m128 a = _mm_load_ps(spl); + __m128 b = _mm_load_ps(sph); + __m128 c = _mm_unpacklo_ps(a, b); + __m128 d = _mm_unpackhi_ps(a, b); + _mm_store_ps(dp, c); + _mm_store_ps(dp + 4, d); + } + } + ////////////////////////////////////////////////////////////////////////// static inline void sse_multiply_const(float* p, float f, int width) { @@ -100,11 +130,11 @@ namespace ojph { { // split src into ldst and hdst { - float* dpl = ldst->f32; - float* dph = hdst->f32; + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? 
hdst->f32 : ldst->f32; float* sp = src->f32; int w = (int)width; - SSE_DEINTERLEAVE32(dpl, dph, sp, w, even); + sse_deinterleave32(dpl, dph, sp, w); } // the actual horizontal transform @@ -235,10 +265,10 @@ namespace ojph { // combine both lsrc and hsrc into dst { float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; int w = (int)width; - SSE_INTERLEAVE32(dp, spl, sph, w, even); + sse_interleave32(dp, spl, sph, w); } } else { diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp index 1236b7cd..54a03005 100644 --- a/src/core/transform/ojph_transform_sse2.cpp +++ b/src/core/transform/ojph_transform_sse2.cpp @@ -64,58 +64,64 @@ namespace ojph { return result; } + ////////////////////////////////////////////////////////////////////////// + static inline + void sse2_deinterleave32(float* dpl, float* dph, float* sp, int width) + { + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + __m128 a = _mm_load_ps(sp); + __m128 b = _mm_load_ps(sp + 4); + __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); + __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); + _mm_store_ps(dpl, c); + _mm_store_ps(dph, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void sse2_interleave32(float* dp, float* spl, float* sph, int width) \ + { + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) + { + __m128 a = _mm_load_ps(spl); + __m128 b = _mm_load_ps(sph); + __m128 c = _mm_unpacklo_ps(a, b); + __m128 d = _mm_unpackhi_ps(a, b); + _mm_store_ps(dp, c); + _mm_store_ps(dp + 4, d); + } + } + ////////////////////////////////////////////////////////////////////////// static inline - void sse2_deinterleave64(double* dpl, double* dph, double* sp, - int width, bool even) + void sse2_deinterleave64(double* dpl, double* dph, double* sp, int width) { - if (even) 
- for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) - { - __m128d a = _mm_load_pd(sp); - __m128d b = _mm_load_pd(sp + 2); - __m128d c = _mm_shuffle_pd(a, b, 0); - __m128d d = _mm_shuffle_pd(a, b, 3); - _mm_store_pd(dpl, c); - _mm_store_pd(dph, d); - } - else - for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) - { - __m128d a = _mm_load_pd(sp); - __m128d b = _mm_load_pd(sp + 2); - __m128d c = _mm_shuffle_pd(a, b, 0); - __m128d d = _mm_shuffle_pd(a, b, 3); - _mm_store_pd(dpl, d); - _mm_store_pd(dph, c); - } + for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) + { + __m128d a = _mm_load_pd(sp); + __m128d b = _mm_load_pd(sp + 2); + __m128d c = _mm_shuffle_pd(a, b, 0); + __m128d d = _mm_shuffle_pd(a, b, 3); + _mm_store_pd(dpl, c); + _mm_store_pd(dph, d); + } } ////////////////////////////////////////////////////////////////////////// static inline - void sse2_interleave64(double* dp, double* spl, double* sph, - int width, bool even) + void sse2_interleave64(double* dp, double* spl, double* sph, int width) { - if (even) - for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) - { - __m128d a = _mm_load_pd(spl); - __m128d b = _mm_load_pd(sph); - __m128d c = _mm_unpacklo_pd(a, b); - __m128d d = _mm_unpackhi_pd(a, b); - _mm_store_pd(dp, c); - _mm_store_pd(dp + 2, d); - } - else - for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) - { - __m128d a = _mm_load_pd(spl); - __m128d b = _mm_load_pd(sph); - __m128d c = _mm_unpacklo_pd(b, a); - __m128d d = _mm_unpackhi_pd(b, a); - _mm_store_pd(dp, c); - _mm_store_pd(dp + 2, d); - } + for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) + { + __m128d a = _mm_load_pd(spl); + __m128d b = _mm_load_pd(sph); + __m128d c = _mm_unpacklo_pd(a, b); + __m128d d = _mm_unpackhi_pd(a, b); + _mm_store_pd(dp, c); + _mm_store_pd(dp + 2, d); + } } ///////////////////////////////////////////////////////////////////////// @@ -360,7 +366,7 @@ namespace ojph { (aug == NULL || aug->flags & line_buf::LFT_64BIT)); 
sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis); } - } + } ///////////////////////////////////////////////////////////////////////// static @@ -372,11 +378,11 @@ namespace ojph { { // combine both lsrc and hsrc into dst { - float* dpl = ldst->f32; - float* dph = hdst->f32; + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; float* sp = src->f32; int w = (int)width; - SSE_DEINTERLEAVE32(dpl, dph, sp, w, even); + sse2_deinterleave32(dpl, dph, sp, w); } si32* hp = hdst->i32, * lp = ldst->i32; @@ -519,11 +525,11 @@ namespace ojph { { // combine both lsrc and hsrc into dst { - double* dpl = (double*)ldst->p; - double* dph = (double*)hdst->p; + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); double* sp = (double*)src->p; int w = (int)width; - sse2_deinterleave64(dpl, dph, sp, w, even); + sse2_deinterleave64(dpl, dph, sp, w); } si64* hp = hdst->i64, * lp = ldst->i64; @@ -811,10 +817,10 @@ namespace ojph { // combine both lsrc and hsrc into dst { float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; int w = (int)width; - SSE_INTERLEAVE32(dp, spl, sph, w, even); + sse2_interleave32(dp, spl, sph, w); } } else { @@ -958,10 +964,10 @@ namespace ojph { // combine both lsrc and hsrc into dst { double* dp = (double*)dst->p; - double* spl = (double*)lsrc->p; - double* sph = (double*)hsrc->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? 
hsrc->p : lsrc->p); int w = (int)width; - sse2_interleave64(dp, spl, sph, w, even); + sse2_interleave64(dp, spl, sph, w); } } else { diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp index bd652dfa..e3085594 100644 --- a/src/core/transform/ojph_transform_wasm.cpp +++ b/src/core/transform/ojph_transform_wasm.cpp @@ -51,65 +51,69 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void wasm_deinterleave(float* dpl, float* dph, float* sp, - int width, bool even) + static inline + void wasm_deinterleave32(float* dpl, float* dph, float* sp, int width) { - if (even) - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) - { - v128_t a = wasm_v128_load(sp); - v128_t b = wasm_v128_load(sp + 4); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); - v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); - // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); - // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); - wasm_v128_store(dpl, c); - wasm_v128_store(dph, d); - } - else - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) - { - v128_t a = wasm_v128_load(sp); - v128_t b = wasm_v128_load(sp + 4); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); - v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); - // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); - // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); - wasm_v128_store(dpl, d); - wasm_v128_store(dph, c); - } + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 4); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); + v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); + // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); + // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } 
} ////////////////////////////////////////////////////////////////////////// - void wasm_interleave(float* dp, float* spl, float* sph, - int width, bool even) + static inline + void wasm_interleave32(float* dp, float* spl, float* sph, int width) { - if (even) - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) - { - v128_t a = wasm_v128_load(spl); - v128_t b = wasm_v128_load(sph); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); - v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); - // v128_t c = _mm_unpacklo_ps(a, b); - // v128_t d = _mm_unpackhi_ps(a, b); - wasm_v128_store(dp, c); - wasm_v128_store(dp + 4, d); - } - else - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) - { - v128_t a = wasm_v128_load(spl); - v128_t b = wasm_v128_load(sph); - v128_t c = wasm_i32x4_shuffle(b, a, 0, 4 + 0, 1, 4 + 1); - v128_t d = wasm_i32x4_shuffle(b, a, 2, 4 + 2, 3, 4 + 3); - // v128_t c = _mm_unpacklo_ps(b, a); - // v128_t d = _mm_unpackhi_ps(b, a); - wasm_v128_store(dp, c); - wasm_v128_store(dp + 4, d); - } + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); + v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); + // v128_t c = _mm_unpacklo_ps(a, b); + // v128_t d = _mm_unpackhi_ps(a, b); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 4, d); + } } + ////////////////////////////////////////////////////////////////////////// + static inline + void wasm_deinterleave64(double* dpl, double* dph, double* sp, int width) + { + for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 2); + v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline 
+ void wasm_interleave64(double* dp, double* spl, double* sph, int width) + { + for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = wasm_i64x2_shuffle(a, b, 2, 2 + 2); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 2, d); + } + } + ////////////////////////////////////////////////////////////////////////// static inline void wasm_multiply_const(float* p, float f, int width) { @@ -159,7 +163,13 @@ namespace ojph { if (width > 1) { // split src into ldst and hdst - wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + wasm_deinterleave32(dpl, dph, sp, w); + } // the actual horizontal transform float* hp = hdst->f32, * lp = ldst->f32; @@ -287,7 +297,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? 
hsrc->f32 : lsrc->f32; + int w = (int)width; + wasm_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -298,13 +314,13 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, - ui32 repeat, bool synthesis) + void wasm_rev_vert_step32(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) { const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; v128_t va = wasm_i32x4_splat(a); v128_t vb = wasm_i32x4_splat(b); @@ -428,14 +444,174 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, - ui32 width, bool even) + void wasm_rev_vert_step64(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = wasm_i64x2_splat(a); + v128_t vb = wasm_i64x2_splat(b); + + si64* dst = aug->i64; + const si64* src1 = sig->i64, * src2 = other->i64; + // The general definition of the wavelet in Part 2 is slightly + // different to part 2, although they are mathematically equivalent + // here, we identify the simpler form from Part 1 and employ them + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + 
wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = 
wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + else + { // general case + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + wasm_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + wasm_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void wasm_rev_horz_ana32(const 
param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { // combine both lsrc and hsrc into dst - wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + wasm_deinterleave32(dpl, dph, sp, w); + } si32* hp = hdst->i32, * lp = ldst->i32; ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass @@ -447,7 +623,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; v128_t va = wasm_i32x4_splat(a); v128_t vb = wasm_i32x4_splat(b); @@ -587,11 +763,199 @@ namespace ojph { hdst->i32[0] = src->i32[0] << 1; } } - - ////////////////////////////////////////////////////////////////////////// - void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + + ///////////////////////////////////////////////////////////////////////// + static + void wasm_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // combine both lsrc and hsrc into dst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)src->p; + int w = (int)width; + wasm_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = wasm_i64x2_splat(a); + v128_t vb = wasm_i64x2_splat(b); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_sub(d, w); 
+ wasm_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + 
///////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + wasm_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + wasm_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -605,7 +969,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; v128_t va = wasm_i32x4_splat(a); v128_t vb = wasm_i32x4_splat(b); @@ -739,7 +1103,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + wasm_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -749,5 +1119,192 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = wasm_i64x2_splat(a); + v128_t vb = wasm_i64x2_splat(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + 
wasm_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? 
lsrc->p : hsrc->p); + double* sph = (double*)(even ? hsrc->p : lsrc->p); + int w = (int)width; + wasm_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + wasm_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + wasm_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph From 1383f9bfe3a0819f305af63c8fca44ba7709ed5a Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Fri, 8 Nov 2024 22:47:43 +1100 Subject: [PATCH 68/78] All wasm code was tested except for NLT. 
--- src/core/codestream/ojph_codestream_wasm.cpp | 8 ++++---- src/core/common/ojph_arch.h | 12 +++++++++--- src/core/transform/ojph_colour_wasm.cpp | 8 ++++---- src/core/transform/ojph_transform_wasm.cpp | 2 +- tests/CMakeLists.txt | 2 +- tests/test_executables.cpp | 6 ++++++ 6 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/core/codestream/ojph_codestream_wasm.cpp b/src/core/codestream/ojph_codestream_wasm.cpp index 8dd76491..e2cd444b 100644 --- a/src/core/codestream/ojph_codestream_wasm.cpp +++ b/src/core/codestream/ojph_codestream_wasm.cpp @@ -87,8 +87,8 @@ namespace ojph { v128_t zero = wasm_i32x4_splat(0); v128_t one = wasm_i32x4_splat(1); v128_t tmax = wasm_v128_load(max_val); - v128_t *p = (v128_t*)sp; - for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4) + si32 *p = (si32*)sp; + for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4) { v128_t v = wasm_v128_load(p); v128_t sign = wasm_i32x4_lt(v, zero); @@ -192,7 +192,7 @@ namespace ojph { si64 *p = (si64*)sp; for (ui32 i = 0; i < count; i += 2, p += 2, dp += 2) { - v128_t v = wasm_v128_load((v128_t*)sp); + v128_t v = wasm_v128_load(p); v128_t sign = wasm_i64x2_lt(v, zero); v128_t val = wasm_v128_xor(v, sign); // negate 1's complement v128_t ones = wasm_v128_and(sign, one); @@ -204,7 +204,7 @@ namespace ojph { wasm_v128_store(dp, val); } wasm_v128_store(max_val, tmax); - } + } ////////////////////////////////////////////////////////////////////////// void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h index 8292a686..29ab7a57 100644 --- a/src/core/common/ojph_arch.h +++ b/src/core/common/ojph_arch.h @@ -286,9 +286,15 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // constants //////////////////////////////////////////////////////////////////////////// - const ui32 byte_alignment = 64; // 64 bytes == 512 bits - const ui32 log_byte_alignment = 31 - 
count_leading_zeros(byte_alignment); - const ui32 object_alignment = 8; + #ifndef OJPH_EMSCRIPTEN + const ui32 byte_alignment = 64; // 64 bytes == 512 bits + const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); + const ui32 object_alignment = 8; + #else + const ui32 byte_alignment = 16; // 16 bytes == 128 bits + const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); + const ui32 object_alignment = 8; + #endif //////////////////////////////////////////////////////////////////////////// // templates for alignment diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 9628d556..5bf6ccdd 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -83,7 +83,7 @@ namespace ojph { t = wasm_i64x2_extend_high_i32x4(s); t = wasm_i64x2_add(t, sh); - wasm_v128_store(dp + 1, t); + wasm_v128_store(dp + 2, t); } } } @@ -99,7 +99,7 @@ namespace ojph { v128_t s0, s1; s0 = wasm_v128_load(sp); s0 = wasm_i64x2_add(s0, sh); - s1 = wasm_v128_load(sp + 1); + s1 = wasm_v128_load(sp + 2); s1 = wasm_i64x2_add(s1, sh); s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2); wasm_v128_store(dp, s0); @@ -160,7 +160,7 @@ namespace ojph { u = wasm_v128_andnot(c, u); // keep only +ve or 0 u = wasm_v128_or(u, v_m_sh); // combine - wasm_v128_store(dp + 1, u); + wasm_v128_store(dp + 2, u); } } } @@ -184,7 +184,7 @@ namespace ojph { p = wasm_v128_andnot(m, s); // +ve t0 = wasm_v128_or(n, p); - s = wasm_v128_load(sp + 1); + s = wasm_v128_load(sp + 2); m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value tm = wasm_i64x2_sub(sh, s); // - shift - value n = wasm_v128_and(m, tm); // -ve diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp index e3085594..341cfc32 100644 --- a/src/core/transform/ojph_transform_wasm.cpp +++ b/src/core/transform/ojph_transform_wasm.cpp @@ -108,7 +108,7 @@ namespace ojph { v128_t a = wasm_v128_load(spl); 
v128_t b = wasm_v128_load(sph); v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); - v128_t d = wasm_i64x2_shuffle(a, b, 2, 2 + 2); + v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1); wasm_v128_store(dp, c); wasm_v128_store(dp + 2, d); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 000409ff..8cc1d723 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -3,7 +3,7 @@ include(FetchContent) FetchContent_Declare( googletest - URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz + URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz EXCLUDE_FROM_ALL ) # For Windows: Prevent overriding the parent project's compiler/linker settings diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp index 9f77f75e..600ae532 100644 --- a/tests/test_executables.cpp +++ b/tests/test_executables.cpp @@ -109,6 +109,12 @@ int execute(const std::string& cmd, std::string& result) #define COMPARE_FILES_PATH "./compare_files" #define EXPAND_EXECUTABLE "./ojph_expand" #define COMPRESS_EXECUTABLE "./ojph_compress" +//#define EXPAND_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_expand.js" +//#define COMPRESS_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_compress.js" +//#define EXPAND_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_expand_simd.js" +//#define COMPRESS_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_compress_simd.js" +//#define EXPAND_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_expand" +//#define COMPRESS_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_compress" #endif #define TOL_DOUBLE 0.01 #define TOL_INTEGER 1 From e86d6c89dc2515ccb22d2b5c758e150296518afd Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 9 Nov 2024 08:10:15 +1100 Subject: [PATCH 69/78] This completes the AVX512 code. 
--- src/core/transform/ojph_transform_avx2.cpp | 88 +- src/core/transform/ojph_transform_avx512.cpp | 817 ++++++++++++++++--- src/core/transform/ojph_transform_sse2.cpp | 4 +- 3 files changed, 716 insertions(+), 193 deletions(-) diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp index cb7cd61e..1bc92e60 100644 --- a/src/core/transform/ojph_transform_avx2.cpp +++ b/src/core/transform/ojph_transform_avx2.cpp @@ -371,34 +371,6 @@ namespace ojph { else for (ui32 i = repeat; i > 0; --i) *dst++ += (b + a * (*src1++ + *src2++)) >> e; - - // int i = (int)repeat; - // if (synthesis) - // for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) - // { - // __m256i s1 = _mm256_load_si256((__m256i*)src1); - // __m256i s2 = _mm256_load_si256((__m256i*)src2); - // __m256i d = _mm256_load_si256((__m256i*)dst); - // __m256i t = _mm256_add_epi64(s1, s2); - // __m256i u = _mm256_mullo_epi64(va, t); - // __m256i v = _mm256_add_epi64(vb, u); - // __m256i w = avx2_mm256_srai_epi64(v, e, ve); - // d = _mm256_sub_epi64(d, w); - // _mm256_store_si256((__m256i*)dst, d); - // } - // else - // for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) - // { - // __m256i s1 = _mm256_load_si256((__m256i*)src1); - // __m256i s2 = _mm256_load_si256((__m256i*)src2); - // __m256i d = _mm256_load_si256((__m256i*)dst); - // __m256i t = _mm256_add_epi64(s1, s2); - // __m256i u = _mm256_mullo_epi64(va, t); - // __m256i v = _mm256_add_epi64(vb, u); - // __m256i w = avx2_mm256_srai_epi64(v, e, ve); - // d = _mm256_add_epi64(d, w); - // _mm256_store_si256((__m256i*)dst, d); - // } } } @@ -433,7 +405,7 @@ namespace ojph { { if (width > 1) { - // combine both lsrc and hsrc into dst + // split src into ldst and hdst { float* dpl = even ? ldst->f32 : hdst->f32; float* dph = even ? hdst->f32 : ldst->f32; @@ -601,7 +573,7 @@ namespace ojph { { if (width > 1) { - // combine both lsrc and hsrc into dst + // split src into ldst and hdst { double* dpl = (double*)(even ? 
ldst->p : hdst->p); double* dph = (double*)(even ? hdst->p : ldst->p); @@ -726,34 +698,6 @@ namespace ojph { else for (ui32 i = h_width; i > 0; --i, sp++, dp++) *dp += (b + a * (sp[-1] + sp[0])) >> e; - - // int i = (int)h_width; - // if (even) - // for (; i > 0; i -= 4, sp += 4, dp += 4) - // { - // __m256i s1 = _mm256_load_si256((__m256i*)sp); - // __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); - // __m256i d = _mm256_load_si256((__m256i*)dp); - // __m256i t = _mm256_add_epi64(s1, s2); - // __m256i u = _mm256_mullo_epi64(va, t); - // __m256i v = _mm256_add_epi64(vb, u); - // __m256i w = avx2_mm256_srai_epi64(v, e, ve); - // d = _mm256_add_epi64(d, w); - // _mm256_store_si256((__m256i*)dp, d); - // } - // else - // for (; i > 0; i -= 4, sp += 4, dp += 4) - // { - // __m256i s1 = _mm256_load_si256((__m256i*)sp); - // __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); - // __m256i d = _mm256_load_si256((__m256i*)dp); - // __m256i t = _mm256_add_epi64(s1, s2); - // __m256i u = _mm256_mullo_epi64(va, t); - // __m256i v = _mm256_add_epi64(vb, u); - // __m256i w = avx2_mm256_srai_epi64(v, e, ve); - // d = _mm256_add_epi64(d, w); - // _mm256_store_si256((__m256i*)dp, d); - // } } // swap buffers @@ -1082,34 +1026,6 @@ namespace ojph { else for (ui32 i = aug_width; i > 0; --i, sp++, dp++) *dp -= (b + a * (sp[0] + sp[1])) >> e; - - // int i = (int)aug_width; - // if (ev) - // for (; i > 0; i -= 4, sp += 4, dp += 4) - // { - // __m256i s1 = _mm256_load_si256((__m256i*)sp); - // __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); - // __m256i d = _mm256_load_si256((__m256i*)dp); - // __m256i t = _mm256_add_epi64(s1, s2); - // __m256i u = _mm256_mullo_epi64(va, t); - // __m256i v = _mm256_add_epi64(vb, u); - // __m256i w = avx2_mm256_srai_epi64(v, e, ve); - // d = _mm256_sub_epi64(d, w); - // _mm256_store_si256((__m256i*)dp, d); - // } - // else - // for (; i > 0; i -= 4, sp += 4, dp += 4) - // { - // __m256i s1 = _mm256_load_si256((__m256i*)sp); - // __m256i s2 = 
_mm256_loadu_si256((__m256i*)(sp + 1)); - // __m256i d = _mm256_load_si256((__m256i*)dp); - // __m256i t = _mm256_add_epi64(s1, s2); - // __m256i u = _mm256_mullo_epi64(va, t); - // __m256i v = _mm256_add_epi64(vb, u); - // __m256i w = avx2_mm256_srai_epi64(v, e, ve); - // d = _mm256_sub_epi64(d, w); - // _mm256_store_si256((__m256i*)dp, d); - // } } // swap buffers diff --git a/src/core/transform/ojph_transform_avx512.cpp b/src/core/transform/ojph_transform_avx512.cpp index 504aa870..0e922307 100644 --- a/src/core/transform/ojph_transform_avx512.cpp +++ b/src/core/transform/ojph_transform_avx512.cpp @@ -54,8 +54,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// // We split multiples of 32 followed by multiples of 16, because // we assume byte_alignment == 64 - static void avx512_deinterleave(float* dpl, float* dph, float* sp, - int width, bool even) + static + void avx512_deinterleave32(float* dpl, float* dph, float* sp, int width) { __m512i idx1 = _mm512_set_epi32( 0x1E, 0x1C, 0x1A, 0x18, 0x16, 0x14, 0x12, 0x10, @@ -65,59 +65,33 @@ namespace ojph { 0x1F, 0x1D, 0x1B, 0x19, 0x17, 0x15, 0x13, 0x11, 0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01 ); - if (even) + for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) { - for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) - { - __m512 a = _mm512_load_ps(sp); - __m512 b = _mm512_load_ps(sp + 16); - __m512 c = _mm512_permutex2var_ps(a, idx1, b); - __m512 d = _mm512_permutex2var_ps(a, idx2, b); - _mm512_store_ps(dpl, c); - _mm512_store_ps(dph, d); - } - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, e); - 
_mm256_store_ps(dph, f); - } + __m512 a = _mm512_load_ps(sp); + __m512 b = _mm512_load_ps(sp + 16); + __m512 c = _mm512_permutex2var_ps(a, idx1, b); + __m512 d = _mm512_permutex2var_ps(a, idx2, b); + _mm512_store_ps(dpl, c); + _mm512_store_ps(dph, d); } - else + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) { - for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) - { - __m512 a = _mm512_load_ps(sp); - __m512 b = _mm512_load_ps(sp + 16); - __m512 c = _mm512_permutex2var_ps(a, idx2, b); - __m512 d = _mm512_permutex2var_ps(a, idx1, b); - _mm512_store_ps(dpl, c); - _mm512_store_ps(dph, d); - } - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, f); - _mm256_store_ps(dph, e); - } + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); } } ////////////////////////////////////////////////////////////////////////// // We split multiples of 32 followed by multiples of 16, because // we assume byte_alignment == 64 - static void avx512_interleave(float* dp, float* spl, float* sph, - int width, bool even) + static + void avx512_interleave32(float* dp, float* spl, float* sph, int width) { __m512i idx1 = _mm512_set_epi32( 0x17, 0x7, 0x16, 0x6, 0x15, 0x5, 0x14, 0x4, @@ -127,51 +101,93 @@ namespace ojph { 0x1F, 0xF, 0x1E, 0xE, 0x1D, 0xD, 0x1C, 0xC, 0x1B, 0xB, 0x1A, 0xA, 0x19, 0x9, 0x18, 0x8 ); - if 
(even) + for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) { - for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) - { - __m512 a = _mm512_load_ps(spl); - __m512 b = _mm512_load_ps(sph); - __m512 c = _mm512_permutex2var_ps(a, idx1, b); - __m512 d = _mm512_permutex2var_ps(a, idx2, b); - _mm512_store_ps(dp, c); - _mm512_store_ps(dp + 16, d); - } - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(a, b); - __m256 d = _mm256_unpackhi_ps(a, b); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m512 a = _mm512_load_ps(spl); + __m512 b = _mm512_load_ps(sph); + __m512 c = _mm512_permutex2var_ps(a, idx1, b); + __m512 d = _mm512_permutex2var_ps(a, idx2, b); + _mm512_store_ps(dp, c); + _mm512_store_ps(dp + 16, d); } - else + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) { - for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) - { - __m512 a = _mm512_load_ps(spl); - __m512 b = _mm512_load_ps(sph); - __m512 c = _mm512_permutex2var_ps(b, idx1, a); - __m512 d = _mm512_permutex2var_ps(b, idx2, a); - _mm512_store_ps(dp, c); - _mm512_store_ps(dp + 16, d); - } - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(b, a); - __m256 d = _mm256_unpackhi_ps(b, a); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m256 a = _mm256_load_ps(spl); + __m256 b = _mm256_load_ps(sph); + __m256 c = _mm256_unpacklo_ps(a, b); + __m256 d = _mm256_unpackhi_ps(a, b); + __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); + __m256 f = _mm256_permute2f128_ps(c, d, (3 
<< 4) | (1)); + _mm256_store_ps(dp, e); + _mm256_store_ps(dp + 8, f); + } + } + + ////////////////////////////////////////////////////////////////////////// + // We split multiples of 16 followed by multiples of 8, because + // we assume byte_alignment == 64 + static void avx512_deinterleave64(double* dpl, double* dph, double* sp, + int width) + { + __m512i idx1 = _mm512_set_epi64( + 0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00 + ); + __m512i idx2 = _mm512_set_epi64( + 0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01 + ); + for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8) + { + __m512d a = _mm512_load_pd(sp); + __m512d b = _mm512_load_pd(sp + 8); + __m512d c = _mm512_permutex2var_pd(a, idx1, b); + __m512d d = _mm512_permutex2var_pd(a, idx2, b); + _mm512_store_pd(dpl, c); + _mm512_store_pd(dph, d); + } + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + __m256d a = _mm256_load_pd(sp); + __m256d b = _mm256_load_pd(sp + 4); + __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0)); + __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1)); + __m256d e = _mm256_shuffle_pd(c, d, 0x0); + __m256d f = _mm256_shuffle_pd(c, d, 0xF); + _mm256_store_pd(dpl, e); + _mm256_store_pd(dph, f); + } + } + + ////////////////////////////////////////////////////////////////////////// + // We split multiples of 16 followed by multiples of 8, because + // we assume byte_alignment == 64 + static void avx512_interleave64(double* dp, double* spl, double* sph, + int width) + { + __m512i idx1 = _mm512_set_epi64( + 0xB, 0x3, 0xA, 0x2, 0x9, 0x1, 0x8, 0x0 + ); + __m512i idx2 = _mm512_set_epi64( + 0xF, 0x7, 0xE, 0x6, 0xD, 0x5, 0xC, 0x4 + ); + for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8) + { + __m512d a = _mm512_load_pd(spl); + __m512d b = _mm512_load_pd(sph); + __m512d c = _mm512_permutex2var_pd(a, idx1, b); + __m512d d = _mm512_permutex2var_pd(a, idx2, b); + _mm512_store_pd(dp, c); + _mm512_store_pd(dp + 8, d); + } + for (; width > 0; width -= 8, dp
+= 8, spl += 4, sph += 4) + { + __m256d a = _mm256_load_pd(spl); + __m256d b = _mm256_load_pd(sph); + __m256d c = _mm256_unpacklo_pd(a, b); + __m256d d = _mm256_unpackhi_pd(a, b); + __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0)); + __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1)); + _mm256_store_pd(dp, e); + _mm256_store_pd(dp + 4, f); } } @@ -224,7 +240,13 @@ namespace ojph { if (width > 1) { // split src into ldst and hdst - avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + avx512_deinterleave32(dpl, dph, sp, w); + } // the actual horizontal transform float* hp = hdst->f32, * lp = ldst->f32; @@ -352,7 +374,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? 
hsrc->f32 : lsrc->f32; + int w = (int)width; + avx512_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -364,13 +392,13 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// - void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, - ui32 repeat, bool synthesis) + void avx512_rev_vert_step32(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) { const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -493,14 +521,185 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, - ui32 width, bool even) + void avx512_rev_vert_step64(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + si64* dst = aug->i64; + const si64* src1 = sig->i64, * src2 = other->i64; + // The general definition of the wavelet in Part 2 is slightly + // different to Part 1, although they are mathematically equivalent + // here, we identify the simpler form from Part 1 and employ them + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)src1); + __m512i s2 = _mm512_load_si512((__m512i*)src2); + __m512i d = _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); +
_mm512_store_si512((__m512i*)dst, d); + } + else + for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)src1); + __m512i s2 = _mm512_load_si512((__m512i*)src2); + __m512i d = _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)src1); + __m512i s2 = _mm512_load_si512((__m512i*)src2); + __m512i d = _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + else + for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)src1); + __m512i s2 = _mm512_load_si512((__m512i*)src2); + __m512i d = _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)src1); + __m512i s2 = _mm512_load_si512((__m512i*)src2); + __m512i d = _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + else + for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)src1); + __m512i s2 = _mm512_load_si512((__m512i*)src2); + __m512i d = 
_mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + } + else { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + + // This can only be used if you have AVX512DQ + // { // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)repeat; + // if (synthesis) + // for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)src1); + // __m512i s2 = _mm512_load_si512((__m512i*)src2); + // __m512i d = _mm512_load_si512((__m512i*)dst); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_sub_epi64(d, w); + // _mm512_store_si512((__m512i*)dst, d); + // } + // else + // for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)src1); + // __m512i s2 = _mm512_load_si512((__m512i*)src2); + // __m512i d = _mm512_load_si512((__m512i*)dst); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dst, d); + // } + // } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != 
NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + avx512_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + avx512_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - // combine both lsrc and hsrc into dst - avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + // split src into ldst and hdst + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + avx512_deinterleave32(dpl, dph, sp, w); + } si32* hp = hdst->i32, * lp = ldst->i32; ui32 l_width = (width + (even ? 
1 : 0)) >> 1; // low pass @@ -512,7 +711,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -653,10 +852,211 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // split src into ldst and hdst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)(src->p); + int w = (int)width; + avx512_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = 
_mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. 
+ if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // This can only be used if you have AVX512DQ + // { + // // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)h_width; + // if (even) + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // else + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + avx512_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { 
+ assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + avx512_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -670,7 +1070,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -804,7 +1204,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + avx512_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -814,5 +1220,206 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + 
__m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. 
+ if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } + + // This can only be used if you have AVX512DQ + // { + // // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)aug_width; + // if (ev) + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_sub_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // else + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_sub_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)(dst->p); + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? 
hsrc->p : lsrc->p); + int w = (int)width; + avx512_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + avx512_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + avx512_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp index 54a03005..a69b1fbe 100644 --- a/src/core/transform/ojph_transform_sse2.cpp +++ b/src/core/transform/ojph_transform_sse2.cpp @@ -376,7 +376,7 @@ namespace ojph { { if (width > 1) { - // combine both lsrc and hsrc into dst + // split src into ldst and hdst { float* dpl = even ? ldst->f32 : hdst->f32; float* dph = even ? hdst->f32 : ldst->f32; @@ -523,7 +523,7 @@ namespace ojph { { if (width > 1) { - // combine both lsrc and hsrc into dst + // split src into ldst and hdst { double* dpl = (double*)(even ? ldst->p : hdst->p); double* dph = (double*)(even ? hdst->p : ldst->p); From e96d6da3a128546d35fe5411794d1861534d3ba5 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 9 Nov 2024 08:46:11 +1100 Subject: [PATCH 70/78] Added comment to test_executables regarding testing. 
--- tests/test_executables.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp index 600ae532..22f148e5 100644 --- a/tests/test_executables.cpp +++ b/tests/test_executables.cpp @@ -107,6 +107,19 @@ int execute(const std::string& cmd, std::string& result) #define REF_FILE_DIR "./jp2k_test_codestreams/openjph/references/" #define MSE_PAE_PATH "./mse_pae" #define COMPARE_FILES_PATH "./compare_files" + +// This is a comment to me, to help with emscripten testing. +// This is written after the completion of the tests. +// 1. Compile for the target platform (Linux), selecting from the following +// code the version that suits you; in particular it should be the one +// the uses node. Ideally create two versions of test_executables, one +// for WASM SIMD, and for WASM without SIMD -- use linux cp command to +// create test_executables_simd and test_executables_no_simd +// 2. Compile again, without deleting what compiled; this time compile using +// emscripten, targeting WASM. The compilation is very finicky, do +// 'make clean && make' after every change in code. +// 3. cd to tests, and run test_executables_simd or test_executables_no_simd. + #define EXPAND_EXECUTABLE "./ojph_expand" #define COMPRESS_EXECUTABLE "./ojph_compress" //#define EXPAND_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_expand.js" From 6386dafc71027d729c352b6ac2e64e339d93bb41 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 9 Nov 2024 12:02:06 +1100 Subject: [PATCH 71/78] Fixed the SPqcd issue. 
--- src/core/codestream/ojph_params.cpp | 40 ++++++++++++++++++++----- src/core/codestream/ojph_params_local.h | 17 ++--------- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index 04e52a63..8a234e59 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -786,7 +786,10 @@ namespace ojph { ui32 bit_depth = 32; if (reversible) { bit_depth = siz->get_bit_depth(comp_num); - bit_depth += employing_color_transform + get_num_decompositions(); + bit_depth += comp_num < 3 ? employing_color_transform : 0; + // 3 or 4 is how many extra bits are needed for the HH band at the + // bottom most level of decomposition. + bit_depth += get_num_decompositions() > 5 ? 4 : 3; } return bit_depth; @@ -945,23 +948,46 @@ namespace ojph { void param_qcd::set_rev_quant(ui32 num_decomps, ui32 bit_depth, bool is_employing_color_transform) { - int guard_bits = 1; - Sqcd = (ui8)(guard_bits << 5); //one guard bit, and no quantization ui32 B = bit_depth; B += is_employing_color_transform ? 
1 : 0; //1 bit for RCT int s = 0; double bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); ui32 X = (ui32) ceil(log(bibo_l * bibo_l) / M_LN2); - u8_SPqcd[s++] = encode_SPqcd((ui8)(B + X)); + u8_SPqcd[s++] = (ui8)(B + X); + ui32 max_B_plus_X = (ui32)(B + X); for (ui32 d = num_decomps; d > 0; --d) { double bibo_l = bibo_gains::get_bibo_gain_l(d, true); double bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); X = (ui32) ceil(log(bibo_h * bibo_l) / M_LN2); - u8_SPqcd[s++] = encode_SPqcd((ui8)(B + X)); - u8_SPqcd[s++] = encode_SPqcd((ui8)(B + X)); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); X = (ui32) ceil(log(bibo_h * bibo_h) / M_LN2); - u8_SPqcd[s++] = encode_SPqcd((ui8)(B + X)); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + } + + if (max_B_plus_X > 38) + OJPH_ERROR(0x00050151, "The specified combination of bit_depth, " + "colour transform, and type of wavelet transform requires more than " + "38 bits; it requires %d bits. 
This is beyond what is allowed in " + "the JPEG2000 image coding format.", max_B_plus_X); + + int guard_bits = ojph_max(1, (si32)max_B_plus_X - 31); + Sqcd = (ui8)(guard_bits << 5); + s = 0; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + for (ui32 d = num_decomps; d > 0; --d) + { + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; } } diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index 55dbbde0..cce5cd85 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -596,7 +596,7 @@ namespace ojph { { friend ::ojph::param_qcd; public: - param_qcd() : reversible_SPqcd_shift(3), old_SPqcd(false) + param_qcd() { Lqcd = 0; Sqcd = 0; @@ -650,23 +650,12 @@ namespace ojph { void set_irrev_quant(ui32 num_decomps); ui8 decode_SPqcd(ui8 v) const - { - if (old_SPqcd) return (ui8)(v >> reversible_SPqcd_shift); // old - else { - v = v & 0b11111011; - return (ui8)((v << 5) | (v >> 3)); // new - } - } + { return (ui8)(v >> 3); } ui8 encode_SPqcd(ui8 v) const - { - if (old_SPqcd) return (ui8)(v << reversible_SPqcd_shift); // old - else return (ui8)((v >> 5) | (v << 3)); // new - } + { return (ui8)(v << 3); } protected: ui16 Lqcd; ui8 Sqcd; - const ui8 reversible_SPqcd_shift; - const bool old_SPqcd; union { ui8 u8_SPqcd[97]; From 3292374d6a9504c0376dca75727caf60e7c09cd6 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 10 Nov 2024 09:11:01 +1100 Subject: [PATCH 72/78] This is to address issue #157. 
--- src/core/codestream/ojph_codeblock_fun.cpp | 9 +++++++-- src/core/coding/ojph_block_encoder.h | 3 +++ src/core/coding/ojph_block_encoder_avx2.cpp | 19 ++++++++---------- src/core/coding/ojph_block_encoder_avx512.cpp | 20 ++++++++----------- 4 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index 45504983..2ddb3923 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -205,6 +205,7 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { + decode_cb32 = ojph_decode_codeblock_avx2; find_max_val32 = avx2_find_max_val32; if (reversible) { tx_to_cb32 = avx2_rev_tx_to_cb32; @@ -215,7 +216,8 @@ namespace ojph { tx_from_cb32 = avx2_irv_tx_from_cb32; } encode_cb32 = ojph_encode_codeblock_avx2; - decode_cb32 = ojph_decode_codeblock_avx2; + bool result = initialize_block_encoder_tables_avx2(); + assert(result); find_max_val64 = avx2_find_max_val64; if (reversible) { @@ -231,8 +233,11 @@ namespace ojph { #endif // !OJPH_DISABLE_AVX2 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512)) - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) + if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { encode_cb32 = ojph_encode_codeblock_avx512; + bool result = initialize_block_encoder_tables_avx512(); + assert(result); + } #endif // !OJPH_DISABLE_AVX512 #elif defined(OJPH_ARCH_ARM) diff --git a/src/core/coding/ojph_block_encoder.h b/src/core/coding/ojph_block_encoder.h index d2782fb9..72b3c0d7 100644 --- a/src/core/coding/ojph_block_encoder.h +++ b/src/core/coding/ojph_block_encoder.h @@ -78,6 +78,9 @@ namespace ojph { ui32 stride, ui32* lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *& coded); + + bool initialize_block_encoder_tables_avx2(); + bool initialize_block_encoder_tables_avx512(); } } diff --git a/src/core/coding/ojph_block_encoder_avx2.cpp 
b/src/core/coding/ojph_block_encoder_avx2.cpp index 6f3db34e..7624272d 100644 --- a/src/core/coding/ojph_block_encoder_avx2.cpp +++ b/src/core/coding/ojph_block_encoder_avx2.cpp @@ -218,22 +218,19 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - bool initialize_tables_avx2() { - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { + static bool tables_initialized = false; + + ///////////////////////////////////////////////////////////////////////// + bool initialize_block_encoder_tables_avx2() { + if (!tables_initialized) { memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); - - bool result; - result = vlc_init_tables(); - result = result && uvlc_init_tables(); - return result; + tables_initialized = vlc_init_tables(); + tables_initialized = tables_initialized && uvlc_init_tables(); } - return false; + return tables_initialized; } - ///////////////////////////////////////////////////////////////////////// - static bool tables_initialized = initialize_tables_avx2(); - ///////////////////////////////////////////////////////////////////////// // ///////////////////////////////////////////////////////////////////////// diff --git a/src/core/coding/ojph_block_encoder_avx512.cpp b/src/core/coding/ojph_block_encoder_avx512.cpp index f0c7438b..b35373a7 100644 --- a/src/core/coding/ojph_block_encoder_avx512.cpp +++ b/src/core/coding/ojph_block_encoder_avx512.cpp @@ -218,23 +218,19 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - bool initialize_tables() { - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) - { + static bool tables_initialized = false; + + ///////////////////////////////////////////////////////////////////////// + bool initialize_block_encoder_tables_avx512() { + if (!tables_initialized) { memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); - - bool result; - result = vlc_init_tables(); - result 
= result && uvlc_init_tables(); - return result; + tables_initialized = vlc_init_tables(); + tables_initialized = tables_initialized && uvlc_init_tables(); } - return false; + return tables_initialized; } - ///////////////////////////////////////////////////////////////////////// - static bool tables_initialized = initialize_tables(); - ///////////////////////////////////////////////////////////////////////// // ///////////////////////////////////////////////////////////////////////// From c09dfa0a2b1c5e143f07c082df833f2f6abfb5d0 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 10 Nov 2024 09:13:46 +1100 Subject: [PATCH 73/78] Address warnings in Linux --- src/core/codestream/ojph_codeblock_fun.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index 2ddb3923..08d8d732 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -217,7 +217,7 @@ namespace ojph { } encode_cb32 = ojph_encode_codeblock_avx2; bool result = initialize_block_encoder_tables_avx2(); - assert(result); + assert(result); ojph_unused(result); find_max_val64 = avx2_find_max_val64; if (reversible) { @@ -236,7 +236,7 @@ namespace ojph { if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { encode_cb32 = ojph_encode_codeblock_avx512; bool result = initialize_block_encoder_tables_avx512(); - assert(result); + assert(result); ojph_unused(result); } #endif // !OJPH_DISABLE_AVX512 From 631d7814d66000ae72f8f15e38503f4a63459269 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 10 Nov 2024 09:58:55 +1100 Subject: [PATCH 74/78] This modfied raw file processing to support 32bits. 
--- src/apps/common/ojph_img_io.h | 2 +- src/apps/others/ojph_img_io.cpp | 164 ++++++++++++++++++++++---------- 2 files changed, 115 insertions(+), 51 deletions(-) diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h index 9f7f3356..a9ee243f 100644 --- a/src/apps/common/ojph_img_io.h +++ b/src/apps/common/ojph_img_io.h @@ -760,7 +760,7 @@ namespace ojph { const char* fname; bool is_signed; ui32 bit_depth, bytes_per_sample; - si32 lower_val, upper_val; + si64 lower_val, upper_val; ui32 width; ui8* buffer; ui32 buffer_size; diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index d8120251..22ca2432 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -1618,11 +1618,11 @@ namespace ojph { this->width = width; if (is_signed) { - upper_val = (1 << (bit_depth - 1)); - lower_val = -(1 << (bit_depth - 1)); + upper_val = (1LL << (bit_depth - 1)); + lower_val = -(1LL << (bit_depth - 1)); } else { - upper_val = 1 << bit_depth; - lower_val = 0; + upper_val = 1LL << bit_depth; + lower_val = 0LL; } bytes_per_sample = (bit_depth + 7) >> 3; @@ -1637,63 +1637,127 @@ namespace ojph { assert(fh); assert(comp_num == 0); - if (bytes_per_sample > 3) + if (is_signed) { - const si32* sp = line->i32; - ui32* dp = (ui32*)buffer; - for (ui32 i = width; i > 0; --i) + if (bytes_per_sample > 3) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp++ = (ui32)val; + const si32* sp = line->i32; + si32* dp = (si32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? 
val : lower_val; + *dp++ = (si32)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000151, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x03000151, "unable to write to file %s", fname); - } - else if (bytes_per_sample > 2) - { - const si32* sp = line->i32; - ui32* dp = (ui32*)buffer; - for (ui32 i = width; i > 0; --i) + else if (bytes_per_sample > 2) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp = (ui32)val; - // this only works for little endian architecture - dp = (ui32*)((ui8*)dp + 3); + const si32* sp = line->i32; + si32* dp = (si32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp = (si32)val; + // this only works for little endian architecture + dp = (si32*)((ui8*)dp + 3); + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000152, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x03000152, "unable to write to file %s", fname); - } - else if (bytes_per_sample > 1) - { - const si32* sp = line->i32; - ui16* dp = (ui16*)buffer; - for (ui32 i = width; i > 0; --i) + else if (bytes_per_sample > 1) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp++ = (ui16)val; + const si32* sp = line->i32; + si16* dp = (si16*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? 
val : lower_val; + *dp++ = (si16)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000153, "unable to write to file %s", fname); + } + else + { + const si32* sp = line->i32; + si8* dp = (si8*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (si8)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000154, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x03000153, "unable to write to file %s", fname); } - else + else { - const si32* sp = line->i32; - ui8* dp = (ui8*)buffer; - for (ui32 i = width; i > 0; --i) + if (bytes_per_sample > 3) { - int val = *sp++; - val = val < upper_val ? val : upper_val; - val = val >= lower_val ? val : lower_val; - *dp++ = (ui8)val; + const ui32* sp = (ui32*)line->i32; + ui32* dp = (ui32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (ui32)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000155, "unable to write to file %s", fname); + } + else if (bytes_per_sample > 2) + { + const ui32* sp = (ui32*)line->i32; + ui32* dp = (ui32*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp = (ui32)val; + // this only works for little endian architecture + dp = (ui32*)((ui8*)dp + 3); + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000156, "unable to write to file %s", fname); + } + else if (bytes_per_sample > 1) + { + const ui32* sp = (ui32*)line->i32; + ui16* dp = (ui16*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? 
val : lower_val; + *dp++ = (ui16)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000157, "unable to write to file %s", fname); + } + else + { + const ui32* sp = (ui32*)line->i32; + ui8* dp = (ui8*)buffer; + for (ui32 i = width; i > 0; --i) + { + si64 val = *sp++; + val = val < upper_val ? val : upper_val; + val = val >= lower_val ? val : lower_val; + *dp++ = (ui8)val; + } + if (fwrite(buffer, bytes_per_sample, width, fh) != width) + OJPH_ERROR(0x03000158, "unable to write to file %s", fname); } - if (fwrite(buffer, bytes_per_sample, width, fh) != width) - OJPH_ERROR(0x03000154, "unable to write to file %s", fname); } return width; From 576fef99a98eef26f3bfdc04a51a23b45cd7d22e Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 10 Nov 2024 10:23:44 +1100 Subject: [PATCH 75/78] Changes the messages of ojph_compress. --- src/apps/ojph_compress/ojph_compress.cpp | 30 +++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp index 9dbdefb1..144c8370 100644 --- a/src/apps/ojph_compress/ojph_compress.cpp +++ b/src/apps/ojph_compress/ojph_compress.cpp @@ -592,20 +592,25 @@ int main(int argc, char * argv[]) { ".pfm files receive special treatment. Currently, lossy compression\n" "with these files is not supported, only lossless. When these files are\n" "used, the NLT segment marker is automatically inserted into the\n" - "codestream. For these files the following arguments can be useful\n" - " -signed a comma - separated list of true or false parameters, one\n" + "codestream when needed, as explained shortly. 
The following arguments\n" + "can be useful for this file type.\n" + " -signed a comma-separated list of true or false parameters, one\n" " for each component; for example: true,false,false.\n" - " The sign only affects how values are treated; for negative\n" - " values the standard requires a special non-linear\n" - " transformation. When signed is false, no transformation\n" - " is employed, as we assume all values are 0 or positive.\n" - " When signed is true, the aforementioned transformation is\n" - " employed on negative values only.\n" + " If you are sure that all sample values are positive or 0,\n" + " set the corresponding entry to false; otherwise set it to\n" + " true.\n" + " When a component entry is set to true, an NLT segment\n" + " marker segment is inserted into the codestream.\n" + " The NLT segment specifies a non-linear transform that\n" + " changes only negative values, producing better coding\n" + " efficiency.\n" + " The NLT segment marker might be less supported in other\n" + " encoders.\n" " -bit_depth a comma-separated list of bit depth values, one per \n" " component; for example: 12,10,10.\n" " Floating value numbers are treated as integers, and they\n" " are shifted to the right, keeping only the specified\n" - " number of bits. Note that a bit depth of 28 upwards is not\n" + " number of bits. Up to 32 bits (which is the default) are\n" " supported.\n" "\n"; @@ -836,11 +841,8 @@ int main(int argc, char * argv[]) { nlt.set_type3_transformation(c, true); } else - OJPH_ERROR(0x01000093, "The support for pfm image is not " - "complete; I need to figure how to modify the interface " - "to better support the exchange of floating point data. 
" - "Feeding float point data is not supported yet, unless it " - "is for lossless compression."); + OJPH_ERROR(0x01000093, "We currently support lossless only for " + "pfm images; this may change in the future."); codestream.set_planar(false); if (profile_string[0] != '\0') From 6541f2c1cdfbfafe60607bbe1b54b9756843ee82 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 10 Nov 2024 11:07:48 +1100 Subject: [PATCH 76/78] This address codeQL --- .github/workflows/codeql.yml | 8 ++++---- src/apps/others/ojph_img_io.cpp | 26 ++++++++++++------------- src/core/codestream/ojph_codeblock.cpp | 4 ++-- src/core/codestream/ojph_resolution.cpp | 14 ++++++++----- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index c2d527ad..7d031b59 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -46,11 +46,11 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -64,7 +64,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 # ℹī¸ Command-line programs to run using the OS shell. 
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -77,6 +77,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{matrix.language}}" diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index 22ca2432..05ce4df5 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -329,9 +329,9 @@ namespace ojph { return; if (bytes_per_sample == 1) - temp_buf = alloc_p->post_alloc_data(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data(num_comps * (size_t)width, 0); else - temp_buf = alloc_p->post_alloc_data(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data(num_comps * (size_t)width, 0); } ///////////////////////////////////////////////////////////////////////////// @@ -408,7 +408,7 @@ namespace ojph { "unable to open file %s for writing", filename); fprintf(fh, "P5\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } else @@ -435,7 +435,7 @@ namespace ojph { fprintf(fh, "P6\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); if (result == 0) OJPH_ERROR(0x03000027, "error writing to file %s", filename); - buffer_size = width * num_components * bytes_per_sample; + buffer_size = (size_t)width * num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } fname = filename; @@ -935,12 +935,12 @@ namespace ojph { // the first time trying to access this line if (PLANARCONFIG_SEPARATE == planar_configuration && 0 == comp_num ) { - for (unsigned short color = 0; color < num_comps; color++) + for (ui32 color = 0; color < num_comps; color++) { if (bytes_per_sample == 1) { TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint8, - cur_line, 
color); + cur_line, (ui16)color); ui32 x = color; uint8_t* line_buffer_of_interleaved_components = (uint8_t*)line_buffer; @@ -953,7 +953,7 @@ namespace ojph { else if (bytes_per_sample == 2) { TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint16, - cur_line, color); + cur_line, (ui16)color); ui32 x = color; ui16* line_buffer_of_interleaved_components = (ui16*)line_buffer; for (ui32 i = 0; i < width; i++, x += num_comps) @@ -1070,7 +1070,7 @@ namespace ojph { OJPH_ERROR(0x030000B3, "unable to open file %s for writing", filename); } - buffer_size = width * num_components * bytes_per_sample; + buffer_size = (size_t)width * num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; cur_line = 0; @@ -1146,7 +1146,7 @@ namespace ojph { bytes_per_sample = 2; } samples_per_line = num_components * width; - bytes_per_line = bytes_per_sample * samples_per_line; + bytes_per_line = bytes_per_sample * (size_t)samples_per_line; } @@ -1482,7 +1482,7 @@ namespace ojph { cur_line = 0; bytes_per_sample = (bit_depth + 7) >> 3; - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; } @@ -1626,7 +1626,7 @@ namespace ojph { } bytes_per_sample = (bit_depth + 7) >> 3; - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } @@ -2004,11 +2004,11 @@ namespace ojph { // allocate line_buffer_16bit_samples to hold a line of image data in memory line_buffer_16bit_samples = - (ui16*) malloc(width * num_comps * sizeof(ui16)); + (ui16*) malloc((size_t)width * num_comps * sizeof(ui16)); if (NULL == line_buffer_16bit_samples) OJPH_ERROR(0x03000179, "Unable to allocate %d bytes for " "line_buffer_16bit_samples[] for file %s", - width * num_comps * sizeof(ui16), filename); + (size_t)width * num_comps * sizeof(ui16), filename); cur_line = 0; diff --git 
a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp index bd76fb3f..351284bf 100644 --- a/src/core/codestream/ojph_codeblock.cpp +++ b/src/core/codestream/ojph_codeblock.cpp @@ -66,9 +66,9 @@ namespace ojph { const param_cod* cd = codestream->get_cod(comp_num); ui32 precision = cd->propose_implementation_precision(sz); if (precision <= 32) - allocator->pre_alloc_data(nominal.h * stride, 0); + allocator->pre_alloc_data(nominal.h * (size_t)stride, 0); else - allocator->pre_alloc_data(nominal.h * stride, 0); + allocator->pre_alloc_data(nominal.h * (size_t)stride, 0); } ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp index bcb27c98..02464001 100644 --- a/src/core/codestream/ojph_resolution.cpp +++ b/src/core/codestream/ojph_resolution.cpp @@ -709,7 +709,8 @@ namespace ojph { bands[1].pull_line(), width, horz_even); else memcpy(aug->line->p, child_res->pull_line()->p, - width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); + (size_t)width + * (aug->line->flags & line_buf::LFT_SIZE_MASK)); aug->active = true; vert_even = !vert_even; ++cur_line; @@ -721,7 +722,8 @@ namespace ojph { bands[3].pull_line(), width, horz_even); else memcpy(sig->line->p, bands[2].pull_line()->p, - width * (sig->line->flags & line_buf::LFT_SIZE_MASK)); + (size_t)width + * (sig->line->flags & line_buf::LFT_SIZE_MASK)); sig->active = true; vert_even = !vert_even; ++cur_line; @@ -760,7 +762,8 @@ namespace ojph { bands[1].pull_line(), width, horz_even); else memcpy(aug->line->p, child_res->pull_line()->p, - width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); + (size_t)width + * (aug->line->flags & line_buf::LFT_SIZE_MASK)); } else { @@ -769,7 +772,8 @@ namespace ojph { bands[3].pull_line(), width, horz_even); else memcpy(aug->line->p, bands[2].pull_line()->p, - width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); + (size_t)width + * 
(aug->line->flags & line_buf::LFT_SIZE_MASK)); if (aug->line->flags & line_buf::LFT_32BIT) { si32* sp = aug->line->i32; @@ -891,7 +895,7 @@ namespace ojph { bands[1].pull_line(), width, horz_even); else memcpy(aug->line->p, child_res->pull_line()->p, - width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); + (size_t)width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); return aug->line; } else From dd011ac149e54250224f8548d4752f5de67e518b Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 10 Nov 2024 11:34:19 +1100 Subject: [PATCH 77/78] Addresses a couple of warnings. --- src/apps/common/ojph_img_io.h | 2 +- src/apps/others/ojph_img_io.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h index a9ee243f..c18ee76e 100644 --- a/src/apps/common/ojph_img_io.h +++ b/src/apps/common/ojph_img_io.h @@ -683,7 +683,7 @@ namespace ojph { ui32 bit_depth_of_data[4]; ui32 bytes_per_sample; ui8* buffer; - ui32 buffer_size; + size_t buffer_size; ui32 cur_line, samples_per_line; }; #endif /* OJPH_ENABLE_TIFF_SUPPORT */ diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index 05ce4df5..e77c5b78 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -1070,7 +1070,7 @@ namespace ojph { OJPH_ERROR(0x030000B3, "unable to open file %s for writing", filename); } - buffer_size = (size_t)width * num_components * (size_t)bytes_per_sample; + buffer_size = width * (size_t)num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; cur_line = 0; @@ -1626,7 +1626,7 @@ namespace ojph { } bytes_per_sample = (bit_depth + 7) >> 3; - buffer_size = (size_t)width * bytes_per_sample; + buffer_size = width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } From 811f3a2eb19b4a0d797e682b496489b5e83547e3 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 10 Nov 2024 11:41:23 +1100 Subject: [PATCH 78/78] Address 
warnings in clang. --- src/apps/others/ojph_img_io.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index e77c5b78..89b81279 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -1618,11 +1618,11 @@ namespace ojph { this->width = width; if (is_signed) { - upper_val = (1LL << (bit_depth - 1)); - lower_val = -(1LL << (bit_depth - 1)); + upper_val = ((si64)1 << (bit_depth - 1)); + lower_val = -((si64)1 << (bit_depth - 1)); } else { - upper_val = 1LL << bit_depth; - lower_val = 0LL; + upper_val = (si64)1 << bit_depth; + lower_val = (si64)0; } bytes_per_sample = (bit_depth + 7) >> 3;