Skip to content

Commit

Permalink
Add stride access for DWT with AVX2
Browse files Browse the repository at this point in the history
  • Loading branch information
osamu620 committed Nov 15, 2024
1 parent c8abe53 commit 7e77349
Show file tree
Hide file tree
Showing 12 changed files with 228 additions and 219 deletions.
8 changes: 4 additions & 4 deletions source/apps/imgcmp/image_class.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class image {
bitDepth(0),
isSigned(false),
isBigendian(false),
data(nullptr) {};
data(nullptr){};
// // destructor
~image() { delete[] data; }

Expand Down Expand Up @@ -110,9 +110,9 @@ class image {

// parsing PNM/PGX header
int read_pnmpgx(const char *name) {
constexpr char SP = ' ';
constexpr char LF = '\n';
constexpr char CR = 13;
constexpr char SP = ' ';
constexpr char LF = '\n';
[[maybe_unused]] constexpr char CR = 13;

FILE *fp = fopen(name, "rb");
if (fp == nullptr) {
Expand Down
24 changes: 12 additions & 12 deletions source/core/coding/coding_units.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,10 +267,10 @@ void j2k_codeblock::create_compressed_buffer(buf_chain *tile_buf, int32_t buf_li
*******************************************************************************/
j2k_precinct_subband::j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uint8_t R_b,
uint8_t transformation, float stepsize, sprec_t *ibuf,
const element_siz &bp0, const element_siz &bp1,
const element_siz &p0, const element_siz &p1,
const uint32_t band_stride, const uint16_t &num_layers,
const element_siz &codeblock_size, const uint8_t &Cmodes)
const element_siz &bp0, const element_siz &p0,
const element_siz &p1, const uint32_t band_stride,
const uint16_t &num_layers, const element_siz &codeblock_size,
const uint8_t &Cmodes)
: j2k_region(p0, p1),
orientation(orientation),
inclusion_info(nullptr),
Expand Down Expand Up @@ -1028,8 +1028,8 @@ j2k_precinct::j2k_precinct(const uint8_t &r, const uint32_t &idx, const element_
ceil_int(pos1.y - yob[subband[i]->orientation], sr));
this->pband[i] = MAKE_UNIQUE<j2k_precinct_subband>(
subband[i]->orientation, subband[i]->M_b, subband[i]->R_b, subband[i]->transformation,
subband[i]->delta, subband[i]->i_samples, subband[i]->pos0, subband[i]->pos1, pbpos0, pbpos1,
subband[i]->stride, num_layers, codeblock_size, Cmodes);
subband[i]->delta, subband[i]->i_samples, subband[i]->pos0, pbpos0, pbpos1, subband[i]->stride,
num_layers, codeblock_size, Cmodes);
}
}

Expand Down Expand Up @@ -1402,7 +1402,7 @@ void j2k_tile_component::init(j2k_main_header *hdr, j2k_tilepart_header *tphdr,
const uint32_t aligned_stride =
round_up((ceil_int(pos1.x, 1U << tile->reduce_NL) - ceil_int(pos0.x, 1U << tile->reduce_NL)), 32U);
const auto height = static_cast<uint32_t>(ceil_int(pos1.y, 1U << tile->reduce_NL)
- ceil_int(pos0.y, 1U << tile->reduce_NL));
- ceil_int(pos0.y, 1U << tile->reduce_NL));
const uint32_t num_bufsamples = aligned_stride * height;
samples = static_cast<int32_t *>(aligned_mem_alloc(sizeof(int32_t) * num_bufsamples, 32));

Expand Down Expand Up @@ -2459,7 +2459,7 @@ void j2k_tile::decode() {
#endif
}
} // end of codeblock loop
} // end of subband loop
} // end of subband loop
#ifdef OPENHTJ2K_THREAD
for (auto &result : results) {
result.get();
Expand Down Expand Up @@ -2507,7 +2507,7 @@ void j2k_tile::decode() {

// copy samples in resolution buffer to that in tile component buffer
uint32_t height = tc1.y - tc0.y;
uint32_t width = round_up(tc1.x - tc0.x, 32);
uint32_t width = round_up(tc1.x - tc0.x, 32U);
uint32_t stride = round_up(width, 32U);
// size_t num_samples = static_cast<size_t>(tc1.x - tc0.x) * (tc1.y - tc0.y);
#if defined(OPENHTJ2K_ENABLE_ARM_NEON)
Expand Down Expand Up @@ -3013,7 +3013,7 @@ uint8_t *j2k_tile::encode() {
#if defined(OPENHTJ2K_TRY_AVX2) && defined(__AVX2__)
for (uint32_t y = 0; y < height; ++y) {
int32_t *sp = src + y * stride;
sprec_t *dp = cr->i_samples + y * (bottom_right.x - top_left.x);
sprec_t *dp = cr->i_samples + y * stride;
uint32_t num_tc_samples = bottom_right.x - top_left.x;
for (; num_tc_samples >= 16; num_tc_samples -= 16) {
__m256i v0 = _mm256_load_si256((__m256i *)sp);
Expand All @@ -3030,7 +3030,7 @@ uint8_t *j2k_tile::encode() {
#elif defined(OPENHTJ2K_ENABLE_ARM_NEON)
for (uint32_t y = 0; y < height; ++y) {
int32_t *sp = src + y * stride;
sprec_t *dp = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32);
sprec_t *dp = cr->i_samples + y * stride;
uint32_t num_tc_samples = bottom_right.x - top_left.x;
for (; num_tc_samples >= 8; num_tc_samples -= 8) {
auto vsrc0 = vld1q_s32(sp);
Expand All @@ -3046,7 +3046,7 @@ uint8_t *j2k_tile::encode() {
#else
for (uint32_t y = 0; y < height; ++y) {
int32_t *sp = src + y * stride;
sprec_t *dp = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32);
sprec_t *dp = cr->i_samples + y * round_up(bottom_right.x - top_left.x, 32U);
uint32_t num_tc_samples = bottom_right.x - top_left.x;
for (; num_tc_samples > 0; --num_tc_samples) {
*dp++ = static_cast<sprec_t>(*sp++);
Expand Down
17 changes: 8 additions & 9 deletions source/core/coding/coding_units.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class j2k_region {
// set bottom-right coordinate (exclusive)
void set_pos1(element_siz in) { pos1 = in; }
j2k_region() = default;
j2k_region(element_siz p0, element_siz p1) : pos0(p0), pos1(p1), stride(round_up(pos1.x - pos0.x, 32)) {}
j2k_region(element_siz p0, element_siz p1) : pos0(p0), pos1(p1), stride(round_up(pos1.x - pos0.x, 32U)) {}
};

/********************************************************************************
Expand Down Expand Up @@ -179,10 +179,9 @@ class j2k_precinct_subband : public j2k_region {
uint32_t num_codeblock_x;
uint32_t num_codeblock_y;
j2k_precinct_subband(uint8_t orientation, uint8_t M_b, uint8_t R_b, uint8_t transformation,
float stepsize, sprec_t *ibuf, const element_siz &bp0, const element_siz &bp1,
const element_siz &p0, const element_siz &p1, const uint32_t stride,
const uint16_t &num_layers, const element_siz &codeblock_size,
const uint8_t &Cmodes);
float stepsize, sprec_t *ibuf, const element_siz &bp0, const element_siz &p0,
const element_siz &p1, const uint32_t stride, const uint16_t &num_layers,
const element_siz &codeblock_size, const uint8_t &Cmodes);
~j2k_precinct_subband() {
delete inclusion_info;
delete ZBP_info;
Expand Down Expand Up @@ -257,7 +256,7 @@ class j2c_packet {
uint32_t length;

j2c_packet()
: layer(0), resolution(0), component(0), precinct(0), header(nullptr), body(nullptr), length(0) {};
: layer(0), resolution(0), component(0), precinct(0), header(nullptr), body(nullptr), length(0){};
// constructor for decoding
j2c_packet(const uint16_t l, const uint8_t r, const uint16_t c, const uint32_t p,
buf_chain *const h = nullptr, buf_chain *const bo = nullptr)
Expand Down Expand Up @@ -316,7 +315,7 @@ class j2k_resolution : public j2k_region {
void scale();
void destroy() {
aligned_mem_free(i_samples);
for (auto b = 0; b < num_bands; ++b) {
for (uint8_t b = 0; b < num_bands; ++b) {
if (subbands != nullptr) {
subbands[b]->destroy();
}
Expand Down Expand Up @@ -424,7 +423,7 @@ class j2k_tile_component : public j2k_tile_base {
void perform_dc_offset(uint8_t transformation, bool is_signed);

void destroy() {
for (auto r = 0; r < this->NL; ++r) {
for (uint8_t r = 0; r < this->NL; ++r) {
if (resolution != nullptr) {
auto p = resolution[r].get();
if (p != nullptr) resolution[r]->destroy();
Expand Down Expand Up @@ -495,7 +494,7 @@ class j2k_tile : public j2k_tile_base {
public:
j2k_tile();
void destroy() {
for (auto c = 0; c < this->num_components; ++c) {
for (uint16_t c = 0; c < this->num_components; ++c) {
tcomp[c].destroy();
}
}
Expand Down
38 changes: 19 additions & 19 deletions source/core/coding/ht_block_decoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
for (uint32_t x = 0; x < block->size.x; sp += 2, ++vp) {
uint32_t inf = sp[0];
uint32_t U_q = sp[1];
if (U_q > ((30 - pLSB) + 2)) {
if (U_q > ((30U - pLSB) + 2U)) {
printf("ERROR\n");
}

Expand All @@ -279,14 +279,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
MagSgn.advance(m_n); // consume m_n

val = ms_val << 31; // get sign bit
v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits
v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits
v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
v_n |= 1; // add center of bin
// v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
// add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
val |= (v_n + 2) << (pLSB - 1);
}
dp[0] = val;
dp[0] = static_cast<int32_t>(val);

v_n = 0;
val = 0;
Expand All @@ -298,14 +298,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
MagSgn.advance(m_n); // consume m_n

val = ms_val << 31; // get sign bit
v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits
v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits
v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
v_n |= 1; // add center of bin
// v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
// add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
val |= (v_n + 2) << (pLSB - 1);
}
dp[block->blksampl_stride] = val;
dp[block->blksampl_stride] = static_cast<int32_t>(val);
vp[0] = prev_v_n | v_n;
prev_v_n = 0;
++dp;
Expand All @@ -323,14 +323,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
MagSgn.advance(m_n); // consume m_n

val = ms_val << 31; // get sign bit
v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits
v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits
v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
v_n |= 1; // add center of bin
// v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
// add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
val |= (v_n + 2) << (pLSB - 1);
}
dp[0] = val;
dp[0] = static_cast<int32_t>(val);

v_n = 0;
val = 0;
Expand All @@ -342,22 +342,22 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
MagSgn.advance(m_n); // consume m_n

val = ms_val << 31; // get sign bit
v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits
v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits
v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
v_n |= 1; // add center of bin
// v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
// add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
val |= (v_n + 2) << (pLSB - 1);
}
dp[block->blksampl_stride] = val;
dp[block->blksampl_stride] = static_cast<int32_t>(val);
prev_v_n = v_n;
++dp;
++x;
}
vp[0] = prev_v_n;

for (uint32_t y = 2; y < block->size.y; y += 2) {
uint16_t *sp = scratch + (y >> 1) * sstr;
uint16_t *sp = scratch + (y >> 1) * static_cast<uint32_t>(sstr);
uint32_t *vp = v_n_scratch;
int32_t *dp = block->sample_buf + y * block->blksampl_stride;

Expand All @@ -373,7 +373,7 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
uint32_t kappa = gamma ? emax : 1;

uint32_t U_q = u_q + kappa;
if (U_q > ((30 - pLSB) + 2)) {
if (U_q > ((30U - pLSB) + 2U)) {
printf("ERROR\n");
}

Expand All @@ -387,14 +387,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
MagSgn.advance(m_n); // consume m_n

val = ms_val << 31; // get sign bit
v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits
v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits
v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
v_n |= 1; // add center of bin
// v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
// add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
val |= (v_n + 2) << (pLSB - 1);
}
dp[0] = val;
dp[0] = static_cast<int32_t>(val);

v_n = 0;
val = 0;
Expand All @@ -406,14 +406,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
MagSgn.advance(m_n); // consume m_n

val = ms_val << 31; // get sign bit
v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits
v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits
v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
v_n |= 1; // add center of bin
// v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
// add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
val |= (v_n + 2) << (pLSB - 1);
}
dp[block->blksampl_stride] = val;
dp[block->blksampl_stride] = static_cast<int32_t>(val);
vp[0] = prev_v_n | v_n;
prev_v_n = 0;
++dp;
Expand All @@ -431,14 +431,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
MagSgn.advance(m_n); // consume m_n

val = ms_val << 31; // get sign bit
v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits
v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits
v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
v_n |= 1; // add center of bin
// v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
// add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
val |= (v_n + 2) << (pLSB - 1);
}
dp[0] = val;
dp[0] = static_cast<int32_t>(val);

v_n = 0;
val = 0;
Expand All @@ -450,14 +450,14 @@ void ht_cleanup_decode(j2k_codeblock *block, const uint8_t &pLSB, const int32_t
MagSgn.advance(m_n); // consume m_n

val = ms_val << 31; // get sign bit
v_n = ms_val & ((1 << m_n) - 1); // keep only m_n bits
v_n = ms_val & ((1U << m_n) - 1U); // keep only m_n bits
v_n |= ((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB
v_n |= 1; // add center of bin
// v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit
// add 2 to make it 2*\mu+0.5, shift it up to missing MSBs
val |= (v_n + 2) << (pLSB - 1);
}
dp[block->blksampl_stride] = val;
dp[block->blksampl_stride] = static_cast<int32_t>(val);
prev_v_n = v_n;
++dp;
++x;
Expand Down
2 changes: 1 addition & 1 deletion source/core/coding/ht_block_decoding.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -866,7 +866,7 @@ class fwd_buf {
*/
FORCE_INLINE void advance(uint32_t num_bits) {
// if (!num_bits) return;
if (!(num_bits >= 0 && num_bits <= this->bits && num_bits < 128)) {
if (!(num_bits <= this->bits && num_bits < 128)) {
printf("Value of numbits = %d is out of range.\n", num_bits);
throw std::exception();
}
Expand Down
10 changes: 5 additions & 5 deletions source/core/coding/ht_block_encoding_avx2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,22 +85,22 @@ class state_MS_enc {
uint32_t t = 0;

// _bzhi_u32(UINT32_MAX, len) = ((1U << len) - 1U)
tmp = val & ((1 << (8 - stuff)) - 1);
tmp = val & ((1U << (8U - stuff)) - 1U);
t |= tmp;
bits_local += 8 - stuff;
stuff = (tmp == 0xFF);

tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1);
tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U);
t |= tmp << 8;
bits_local += 8 - stuff;
stuff = (tmp == 0xFF);

tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1);
tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U);
t |= tmp << 16;
bits_local += 8 - stuff;
stuff = (tmp == 0xFF);

tmp = (val >> (bits_local)) & ((1 << (8 - stuff)) - 1);
tmp = (val >> (bits_local)) & ((1U << (8U - stuff)) - 1U);
t |= tmp << 24;
bits_local += 8 - stuff;
last = tmp & 0xFF;
Expand Down Expand Up @@ -168,7 +168,7 @@ class state_MS_enc {
}
#else
for (int i = 0; i < 4; ++i) {
Creg |= static_cast<__uint128_t>(_mm_cvtsi128_si32(v)) << ctreg;
Creg |= static_cast<uint64_t>(static_cast<__uint128_t>(_mm_cvtsi128_si32(v)) << ctreg);
ctreg += static_cast<uint32_t>(_mm_cvtsi128_si32(m));
v = _mm_srli_si128(v, 4);
m = _mm_srli_si128(m, 4);
Expand Down
2 changes: 1 addition & 1 deletion source/core/transform/color.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#if !defined(OPENHTJ2K_TRY_AVX2) && !defined(__AVX2__) && !defined(OPENHTJ2K_ENABLE_ARM_NEON)
#if not defined(OPENHTJ2K_TRY_AVX2) && not defined(OPENHTJ2K_ENABLE_ARM_NEON)
#include "color.hpp"

void cvt_rgb_to_ycbcr_rev(int32_t *sp0, int32_t *sp1, int32_t *sp2, uint32_t width, uint32_t height) {
Expand Down
Loading

0 comments on commit 7e77349

Please sign in to comment.