Skip to content

Commit 5df0f8c

Browse files
authored
Merge pull request #183 from aous72/unnecessarily_coded_blocks_bug
This fixes a bug that causes all-zero codeblocks to be coded unnecessarily, producing files that are larger than they need to be. The size increase is not constant and can vary from one run to the next. The bug affects only the SIMD-accelerated paths.
2 parents 3181392 + b705ee1 commit 5df0f8c

File tree

4 files changed

+179
-13
lines changed

4 files changed

+179
-13
lines changed

src/core/codestream/ojph_codestream_avx2.cpp

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ namespace ojph {
8888
__m256i m0 = _mm256_set1_epi32(INT_MIN);
8989
__m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
9090
__m256i *p = (__m256i*)sp;
91-
for (ui32 i = 0; i < count; i += 8, p += 1, dp += 8)
91+
for ( ; count >= 8; count -= 8, p += 1, dp += 8)
9292
{
9393
__m256i v = _mm256_loadu_si256(p);
9494
__m256i sign = _mm256_and_si256(v, m0);
@@ -98,6 +98,22 @@ namespace ojph {
9898
val = _mm256_or_si256(val, sign);
9999
_mm256_storeu_si256((__m256i*)dp, val);
100100
}
101+
if (count)
102+
{
103+
__m256i v = _mm256_loadu_si256(p);
104+
__m256i sign = _mm256_and_si256(v, m0);
105+
__m256i val = _mm256_abs_epi32(v);
106+
val = _mm256_slli_epi32(val, (int)shift);
107+
108+
__m256i c = _mm256_set1_epi32((si32)count);
109+
__m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
110+
__m256i mask = _mm256_cmpgt_epi32(c, idx);
111+
c = _mm256_and_si256(val, mask);
112+
tmax = _mm256_or_si256(tmax, c);
113+
114+
val = _mm256_or_si256(val, sign);
115+
_mm256_storeu_si256((__m256i*)dp, val);
116+
}
101117
_mm256_storeu_si256((__m256i*)max_val, tmax);
102118
}
103119

@@ -113,7 +129,7 @@ namespace ojph {
113129
__m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
114130
float *p = (float*)sp;
115131

116-
for (ui32 i = 0; i < count; i += 8, p += 8, dp += 8)
132+
for ( ; count >= 8; count -= 8, p += 8, dp += 8)
117133
{
118134
__m256 vf = _mm256_loadu_ps(p);
119135
vf = _mm256_mul_ps(vf, d); // multiply
@@ -124,6 +140,23 @@ namespace ojph {
124140
val = _mm256_or_si256(val, sign);
125141
_mm256_storeu_si256((__m256i*)dp, val);
126142
}
143+
if (count)
144+
{
145+
__m256 vf = _mm256_loadu_ps(p);
146+
vf = _mm256_mul_ps(vf, d); // multiply
147+
__m256i val = _mm256_cvtps_epi32(vf); // convert to int
148+
__m256i sign = _mm256_and_si256(val, m0); // get sign
149+
val = _mm256_abs_epi32(val);
150+
151+
__m256i c = _mm256_set1_epi32((si32)count);
152+
__m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
153+
__m256i mask = _mm256_cmpgt_epi32(c, idx);
154+
c = _mm256_and_si256(val, mask);
155+
tmax = _mm256_or_si256(tmax, c);
156+
157+
val = _mm256_or_si256(val, sign);
158+
_mm256_storeu_si256((__m256i*)dp, val);
159+
}
127160
_mm256_storeu_si256((__m256i*)max_val, tmax);
128161
}
129162

@@ -178,7 +211,7 @@ namespace ojph {
178211
__m256i one = _mm256_set1_epi64x(1);
179212
__m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
180213
__m256i *p = (__m256i*)sp;
181-
for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
214+
for ( ; count >= 4; count -= 4, p += 1, dp += 4)
182215
{
183216
__m256i v = _mm256_loadu_si256(p);
184217
__m256i sign = _mm256_cmpgt_epi64(zero, v);
@@ -191,6 +224,25 @@ namespace ojph {
191224
val = _mm256_or_si256(val, sign);
192225
_mm256_storeu_si256((__m256i*)dp, val);
193226
}
227+
if (count)
228+
{
229+
__m256i v = _mm256_loadu_si256(p);
230+
__m256i sign = _mm256_cmpgt_epi64(zero, v);
231+
__m256i val = _mm256_xor_si256(v, sign); // negate 1's complement
232+
__m256i ones = _mm256_and_si256(sign, one);
233+
val = _mm256_add_epi64(val, ones); // 2's complement
234+
sign = _mm256_and_si256(sign, m0);
235+
val = _mm256_slli_epi64(val, (int)shift);
236+
237+
__m256i c = _mm256_set1_epi64x(count);
238+
__m256i idx = _mm256_set_epi64x(3, 2, 1, 0);
239+
__m256i mask = _mm256_cmpgt_epi64(c, idx);
240+
c = _mm256_and_si256(val, mask);
241+
tmax = _mm256_or_si256(tmax, c);
242+
243+
val = _mm256_or_si256(val, sign);
244+
_mm256_storeu_si256((__m256i*)dp, val);
245+
}
194246
_mm256_storeu_si256((__m256i*)max_val, tmax);
195247
}
196248

src/core/codestream/ojph_codestream_sse2.cpp

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ namespace ojph {
8787
__m128i one = _mm_set1_epi32(1);
8888
__m128i tmax = _mm_loadu_si128((__m128i*)max_val);
8989
__m128i *p = (__m128i*)sp;
90-
for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
90+
for ( ; count >= 4; count -= 4, p += 1, dp += 4)
9191
{
9292
__m128i v = _mm_loadu_si128(p);
9393
__m128i sign = _mm_cmplt_epi32(v, zero);
@@ -100,6 +100,25 @@ namespace ojph {
100100
val = _mm_or_si128(val, sign);
101101
_mm_storeu_si128((__m128i*)dp, val);
102102
}
103+
if (count)
104+
{
105+
__m128i v = _mm_loadu_si128(p);
106+
__m128i sign = _mm_cmplt_epi32(v, zero);
107+
__m128i val = _mm_xor_si128(v, sign); // negate 1's complement
108+
__m128i ones = _mm_and_si128(sign, one);
109+
val = _mm_add_epi32(val, ones); // 2's complement
110+
sign = _mm_and_si128(sign, m0);
111+
val = _mm_slli_epi32(val, (int)shift);
112+
113+
__m128i c = _mm_set1_epi32((si32)count);
114+
__m128i idx = _mm_set_epi32(3, 2, 1, 0);
115+
__m128i mask = _mm_cmpgt_epi32(c, idx);
116+
c = _mm_and_si128(val, mask);
117+
tmax = _mm_or_si128(tmax, c);
118+
119+
val = _mm_or_si128(val, sign);
120+
_mm_storeu_si128((__m128i*)dp, val);
121+
}
103122
_mm_storeu_si128((__m128i*)max_val, tmax);
104123
}
105124

@@ -116,7 +135,7 @@ namespace ojph {
116135
__m128i one = _mm_set1_epi32(1);
117136
__m128i tmax = _mm_loadu_si128((__m128i*)max_val);
118137
float *p = (float*)sp;
119-
for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
138+
for ( ; count >= 4; count -= 4, p += 4, dp += 4)
120139
{
121140
__m128 vf = _mm_loadu_ps(p);
122141
vf = _mm_mul_ps(vf, d); // multiply
@@ -130,6 +149,26 @@ namespace ojph {
130149
val = _mm_or_si128(val, sign);
131150
_mm_storeu_si128((__m128i*)dp, val);
132151
}
152+
if (count)
153+
{
154+
__m128 vf = _mm_loadu_ps(p);
155+
vf = _mm_mul_ps(vf, d); // multiply
156+
__m128i val = _mm_cvtps_epi32(vf); // convert to int
157+
__m128i sign = _mm_cmplt_epi32(val, zero); // get sign
158+
val = _mm_xor_si128(val, sign); // negate 1's complement
159+
__m128i ones = _mm_and_si128(sign, one);
160+
val = _mm_add_epi32(val, ones); // 2's complement
161+
162+
__m128i c = _mm_set1_epi32((si32)count);
163+
__m128i idx = _mm_set_epi32(3, 2, 1, 0);
164+
__m128i mask = _mm_cmpgt_epi32(c, idx);
165+
c = _mm_and_si128(val, mask);
166+
tmax = _mm_or_si128(tmax, c);
167+
168+
sign = _mm_slli_epi32(sign, 31);
169+
val = _mm_or_si128(val, sign);
170+
_mm_storeu_si128((__m128i*)dp, val);
171+
}
133172
_mm_storeu_si128((__m128i*)max_val, tmax);
134173
}
135174

@@ -189,7 +228,7 @@ namespace ojph {
189228
__m128i one = _mm_set1_epi64x(1);
190229
__m128i tmax = _mm_loadu_si128((__m128i*)max_val);
191230
__m128i *p = (__m128i*)sp;
192-
for (ui32 i = 0; i < count; i += 2, p += 1, dp += 2)
231+
for ( ; count >= 2; count -= 2, p += 1, dp += 2)
193232
{
194233
__m128i v = _mm_loadu_si128(p);
195234
__m128i sign = _mm_cmplt_epi32(v, zero);
@@ -203,6 +242,24 @@ namespace ojph {
203242
val = _mm_or_si128(val, sign);
204243
_mm_storeu_si128((__m128i*)dp, val);
205244
}
245+
if (count)
246+
{
247+
__m128i v = _mm_loadu_si128(p);
248+
__m128i sign = _mm_cmplt_epi32(v, zero);
249+
sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
250+
__m128i val = _mm_xor_si128(v, sign); // negate 1's complement
251+
__m128i ones = _mm_and_si128(sign, one);
252+
val = _mm_add_epi64(val, ones); // 2's complement
253+
sign = _mm_and_si128(sign, m0);
254+
val = _mm_slli_epi64(val, (int)shift);
255+
256+
__m128i c = _mm_set_epi32(0, 0, (si32)0xFFFFFFFF, (si32)0xFFFFFFFF);
257+
c = _mm_and_si128(val, c);
258+
tmax = _mm_or_si128(tmax, c);
259+
260+
val = _mm_or_si128(val, sign);
261+
_mm_storeu_si128((__m128i*)dp, val);
262+
}
206263
_mm_storeu_si128((__m128i*)max_val, tmax);
207264
}
208265

@@ -222,10 +279,10 @@ namespace ojph {
222279
__m128i val = _mm_and_si128(v, m1);
223280
val = _mm_srli_epi64(val, (int)shift);
224281
__m128i sign = _mm_cmplt_epi32(v, zero);
225-
sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
226-
val = _mm_xor_si128(val, sign); // negate 1's complement
282+
sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
283+
val = _mm_xor_si128(val, sign); // negate 1's complement
227284
__m128i ones = _mm_and_si128(sign, one);
228-
val = _mm_add_epi64(val, ones); // 2's complement
285+
val = _mm_add_epi64(val, ones); // 2's complement
229286
_mm_storeu_si128((__m128i*)p, val);
230287
}
231288
}

src/core/codestream/ojph_codestream_wasm.cpp

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ namespace ojph {
8888
v128_t one = wasm_i32x4_splat(1);
8989
v128_t tmax = wasm_v128_load(max_val);
9090
si32 *p = (si32*)sp;
91-
for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
91+
for ( ; count >= 4; count -= 4, p += 4, dp += 4)
9292
{
9393
v128_t v = wasm_v128_load(p);
9494
v128_t sign = wasm_i32x4_lt(v, zero);
@@ -101,6 +101,25 @@ namespace ojph {
101101
val = wasm_v128_or(val, sign);
102102
wasm_v128_store(dp, val);
103103
}
104+
if (count)
105+
{
106+
v128_t v = wasm_v128_load(p);
107+
v128_t sign = wasm_i32x4_lt(v, zero);
108+
v128_t val = wasm_v128_xor(v, sign); // negate 1's complement
109+
v128_t ones = wasm_v128_and(sign, one);
110+
val = wasm_i32x4_add(val, ones); // 2's complement
111+
sign = wasm_v128_and(sign, m0);
112+
val = wasm_i32x4_shl(val, shift);
113+
114+
v128_t c = wasm_i32x4_splat((si32)count);
115+
v128_t idx = wasm_i32x4_make(0, 1, 2, 3);
116+
v128_t mask = wasm_i32x4_gt(c, idx);
117+
c = wasm_v128_and(val, mask);
118+
tmax = wasm_v128_or(tmax, c);
119+
120+
val = wasm_v128_or(val, sign);
121+
wasm_v128_store(dp, val);
122+
}
104123
wasm_v128_store(max_val, tmax);
105124
}
106125

@@ -117,7 +136,7 @@ namespace ojph {
117136
v128_t one = wasm_i32x4_splat(1);
118137
v128_t tmax = wasm_v128_load(max_val);
119138
float *p = (float*)sp;
120-
for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
139+
for ( ; count >= 4; count -= 4, p += 4, dp += 4)
121140
{
122141
v128_t vf = wasm_v128_load(p);
123142
vf = wasm_f32x4_mul(vf, d); // multiply
@@ -131,6 +150,26 @@ namespace ojph {
131150
val = wasm_v128_or(val, sign);
132151
wasm_v128_store(dp, val);
133152
}
153+
if (count)
154+
{
155+
v128_t vf = wasm_v128_load(p);
156+
vf = wasm_f32x4_mul(vf, d); // multiply
157+
v128_t val = wasm_i32x4_trunc_sat_f32x4(vf); // convert to signed int
158+
v128_t sign = wasm_i32x4_lt(val, zero); // get sign
159+
val = wasm_v128_xor(val, sign); // negate 1's complement
160+
v128_t ones = wasm_v128_and(sign, one);
161+
val = wasm_i32x4_add(val, ones); // 2's complement
162+
163+
v128_t c = wasm_i32x4_splat((si32)count);
164+
v128_t idx = wasm_i32x4_make(0, 1, 2, 3);
165+
v128_t mask = wasm_i32x4_gt(c, idx);
166+
c = wasm_v128_and(val, mask);
167+
tmax = wasm_v128_or(tmax, c);
168+
169+
sign = wasm_i32x4_shl(sign, 31);
170+
val = wasm_v128_or(val, sign);
171+
wasm_v128_store(dp, val);
172+
}
134173
wasm_v128_store(max_val, tmax);
135174
}
136175

@@ -190,7 +229,7 @@ namespace ojph {
190229
v128_t one = wasm_i64x2_splat(1);
191230
v128_t tmax = wasm_v128_load(max_val);
192231
si64 *p = (si64*)sp;
193-
for (ui32 i = 0; i < count; i += 2, p += 2, dp += 2)
232+
for ( ; count >= 2; count -= 2, p += 2, dp += 2)
194233
{
195234
v128_t v = wasm_v128_load(p);
196235
v128_t sign = wasm_i64x2_lt(v, zero);
@@ -203,6 +242,24 @@ namespace ojph {
203242
val = wasm_v128_or(val, sign);
204243
wasm_v128_store(dp, val);
205244
}
245+
if (count)
246+
{
247+
v128_t v = wasm_v128_load(p);
248+
v128_t sign = wasm_i64x2_lt(v, zero);
249+
v128_t val = wasm_v128_xor(v, sign); // negate 1's complement
250+
v128_t ones = wasm_v128_and(sign, one);
251+
val = wasm_i64x2_add(val, ones); // 2's complement
252+
sign = wasm_v128_and(sign, m0);
253+
val = wasm_i64x2_shl(val, shift);
254+
255+
v128_t c = wasm_i32x4_make((si32)0xFFFFFFFF, (si32)0xFFFFFFFF, 0, 0);
256+
c = wasm_v128_and(val, c);
257+
tmax = wasm_v128_or(tmax, c);
258+
259+
val = wasm_v128_or(val, sign);
260+
wasm_v128_store(dp, val);
261+
}
262+
206263
wasm_v128_store(max_val, tmax);
207264
}
208265

src/core/common/ojph_version.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,4 @@
3535

3636
#define OPENJPH_VERSION_MAJOR 0
3737
#define OPENJPH_VERSION_MINOR 21
38-
#define OPENJPH_VERSION_PATCH 1
38+
#define OPENJPH_VERSION_PATCH 2

0 commit comments

Comments
 (0)