Skip to content

Commit

Permalink
*improve SSE4.1 optimizations of class ResizerFloatBilinear (part 7: case of large scale, channels = 3).
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Jan 15, 2025
1 parent 218fa22 commit 51b4815
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 4 deletions.
3 changes: 1 addition & 2 deletions src/Simd/SimdBaseResizerBilinear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,10 @@ namespace Simd
ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param)
: Resizer(param)
{
_rowBuf = _param.align < 16 || ((_param.channels < 4 && _param.align > 16) || _param.channels == 3) || _param.dstH >= _param.srcH;
_rowBuf = _param.align < 16 || (_param.channels < 4 && _param.align > 16) || _param.dstH >= _param.srcH;
#if defined(SIMD_ARM_ENABLE) || defined(SIMD_ARM64_ENABLE)
_rowBuf = true;
#endif

_ay.Resize(_param.dstH, false, _param.align);
_iy.Resize(_param.dstH, false, _param.align);
EstimateIndexAlpha(_param, _param.srcH, _param.dstH, 1, 1, _iy.data, _ay.data);
Expand Down
27 changes: 26 additions & 1 deletion src/Simd/SimdSse41ResizerBilinear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ namespace Simd
void ResizerFloatBilinear::Run(const float* src, size_t srcStride, float* dst, size_t dstStride)
{
size_t cn = _param.channels, cnF = AlignLo(cn, F), cnT = cn - cnF, cnL = cnT - F;
size_t dw = _param.dstW, dw2= AlignLo(dw, 2), dw4 = AlignLo(dw, 4);
size_t dw = _param.dstW, dw2 = AlignLo(dw, 2), dw4 = AlignLo(dw, 4), dw1 = dw - 1;
__m128 _1 = _mm_set1_ps(1.0f);
if (_rowBuf)
{
Expand Down Expand Up @@ -770,6 +770,31 @@ namespace Simd
StoreHalf<0>(dst + od, _mm_add_ps(_mm_mul_ps(r0, fy0), _mm_mul_ps(r1, fy1)));
}
}
else if (cn == 3)
{
size_t dx = 0, od = 0;
for (; dx < dw1; dx += 1, od += 3)
{
size_t os = _ix[dx];
__m128 fx1 = _mm_set1_ps(_ax[dx]);
__m128 fx0 = _mm_sub_ps(_1, fx1);
__m128 r0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + os), fx0), _mm_mul_ps(_mm_loadu_ps(src0 + os + 3), fx1));
__m128 r1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + os), fx0), _mm_mul_ps(_mm_loadu_ps(src1 + os + 3), fx1));
_mm_storeu_ps(dst + od, _mm_add_ps(_mm_mul_ps(r0, fy0), _mm_mul_ps(r1, fy1)));
}
if (dx < dw)
{
size_t os = _ix[dx];
__m128 fx1 = _mm_set1_ps(_ax[dx]);
__m128 fx0 = _mm_sub_ps(_1, fx1);
for (size_t ed = od + 3; od < ed; od++, os++)
{
__m128 r0 = _mm_add_ps(_mm_mul_ps(_mm_load_ss(src0 + os), fx0), _mm_mul_ps(_mm_load_ss(src0 + os + 3), fx1));
__m128 r1 = _mm_add_ps(_mm_mul_ps(_mm_load_ss(src1 + os), fx0), _mm_mul_ps(_mm_load_ss(src1 + os + 3), fx1));
_mm_store_ss(dst + od, _mm_add_ps(_mm_mul_ps(r0, fy0), _mm_mul_ps(r1, fy1)));
}
}
}
else
{
for (size_t dx = 0; dx < dw; dx++)
Expand Down
2 changes: 1 addition & 1 deletion src/Test/TestResize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ namespace Test
bool ResizerAutoTest(const FuncRS & f1, const FuncRS & f2)
{
bool result = true;
result = result && ResizerAutoTest(SimdResizeMethodBilinear, SimdResizeChannelFloat, 2, f1, f2);

#if 1
#if defined(SIMD_X64_ENABLE)
result = result && ResizerAutoTest(SimdResizeMethodBilinear, SimdResizeChannelFloat, 64, f1, f2);
Expand Down

0 comments on commit 51b4815

Please sign in to comment.